Home | History | Annotate | Download | only in intltest
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ********************************************************************************
      5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
      6 *   others. All Rights Reserved.
      7 ********************************************************************************
      8 *   Date        Name        Description
      9 *   10/20/99    alan        Creation.
     10 *   03/22/2000  Madhu       Added additional tests
     11 ********************************************************************************
     12 */
     13 
     14 #include <stdio.h>
     15 
     16 #include <string.h>
     17 #include "unicode/utypes.h"
     18 #include "usettest.h"
     19 #include "unicode/ucnv.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/usetiter.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/parsepos.h"
     25 #include "unicode/symtable.h"
     26 #include "unicode/uversion.h"
     27 #include "cmemory.h"
     28 #include "hash.h"
     29 
// Reports a failure through dataerrln() when `status` holds a failing
// UErrorCode. The message names the current source file and line together
// with the textual name of the error code (u_errorName()).
// The macro only reports; it does not return from the enclosing function.
// NOTE: it expands to a bare { } block rather than do { } while (0), so be
// careful when using it as the only statement of an if/else branch.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
    u_errorName(status));}}
     33 
// Reports a failure through dataerrln(), tagged with the current source file
// and line, when `expr` evaluates to false.
// The macro only reports; it does not return from the enclosing function.
// NOTE: it expands to a bare { } block rather than do { } while (0), so be
// careful when using it as the only statement of an if/else branch.
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
     36 
     37 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
     38     UnicodeString pat;
     39     set.toPattern(pat);
     40     return left + UnicodeSetTest::escape(pat);
     41 }
     42 
// Expands to one `case` label of the dispatch switch in runIndexedTest():
// stores the stringified test name in `name` and, when `exec` is true, logs
// the test name followed by an empty line and then calls the test method.
// Relies on `name` and `exec` being in scope at the point of expansion.
// The trailing `break` deliberately has no semicolon; the call site adds it.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln();                  \
                              test();                   \
                          }                             \
                          break
     51 
// Constructs the test object. The UTF-8 converter pointer starts out NULL;
// it is only opened on demand by openUTF8Converter().
UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
}
     54 
     55 UConverter *UnicodeSetTest::openUTF8Converter() {
     56     if(utf8Cnv==NULL) {
     57         UErrorCode errorCode=U_ZERO_ERROR;
     58         utf8Cnv=ucnv_open("UTF-8", &errorCode);
     59     }
     60     return utf8Cnv;
     61 }
     62 
// Releases the cached UTF-8 converter, if any.
// NOTE(review): utf8Cnv is still NULL when openUTF8Converter() was never
// called; this relies on ucnv_close() tolerating NULL -- confirm against the
// ucnv.h documentation.
UnicodeSetTest::~UnicodeSetTest() {
    ucnv_close(utf8Cnv);
}
     66 
/**
 * Test dispatcher: maps a test index to a test method.
 *
 * For a known index, `name` is set to the test's name and, if `exec` is
 * true, the test is run (see the CASE macro). For any other index `name` is
 * set to the empty string.
 *
 * @param index index of the test to look up
 * @param exec  whether to run the test or only report its name
 * @param name  receives the name of the test at `index`, or "" if none
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        CASE(24,TestUCAUnsafeBackwards);
        // Unknown index: an empty name is reported.
        default: name = ""; break;
    }
}
    100 
// Sentinel entry placed inside the NULL-terminated string arrays passed to
// expectToPattern(), e.g. {"aa", "ab", NOT, "ac", NULL}.
// NOTE(review): entries before NOT appear to be strings the set must contain
// and entries after it strings it must not contain -- confirm against
// expectToPattern().
static const char NOT[] = "%%%%";
    102 
    103 /**
    104  * UVector was improperly copying contents
    105  * This code will crash this is still true
    106  */
    107 void UnicodeSetTest::Testj2268() {
    108   UnicodeSet t;
    109   t.add(UnicodeString("abc"));
    110   UnicodeSet test(t);
    111   UnicodeString ustrPat;
    112   test.toPattern(ustrPat, TRUE);
    113 }
    114 
/**
 * Test toPattern().
 *
 * Covers three areas:
 *  1. round-tripping of generated patterns (via checkPat()/toPatternAux())
 *     for a few property-based sets and for ranges built from non-letter
 *     Latin-1 code points and whitespace;
 *  2. the pattern generated for sets that contain multicharacter strings;
 *  3. JB#3400: a two-character range is written as [ab], not [a-b].
 */
void UnicodeSetTest::TestToPattern() {
    UErrorCode ec = U_ZERO_ERROR;

    // Test that toPattern() round trips with syntax characters and
    // whitespace.
    {
        // NULL-terminated list of additional patterns to round-trip.
        static const char* OTHER_TOPATTERN_TESTS[] = {
            "[[:latin:]&[:greek:]]",
            "[[:latin:]-[:greek:]]",
            "[:nonspacing mark:]",
            NULL
        };

        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
            // Reset the status so one bad pattern does not affect the next.
            ec = U_ZERO_ERROR;
            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
            if (U_FAILURE(ec)) {
                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
                continue;
            }
            checkPat(OTHER_TOPATTERN_TESTS[j], s);
        }

        // Walk every code point, but only exercise the ones that are either
        // non-alphabetic Latin-1 (<= 0xFF) or whitespace.
        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {

                // check various combinations to make sure they all work.
                // A failing combination skips the remaining ones for this i.
                if (i != 0 && !toPatternAux(i, i)){
                    continue;
                }
                if (!toPatternAux(0, i)){
                    continue;
                }
                if (!toPatternAux(i, 0xFFFF)){
                    continue;
                }
            }
        }
    }

    // Test pattern behavior of multicharacter strings.
    {
        ec = U_ZERO_ERROR;
        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);

        // This loop isn't a loop.  It's here to make the compiler happy.
        // If you're curious, try removing it and changing the 'break'
        // statements (except for the last) to goto's.
        for (;;) {
            if (U_FAILURE(ec)) break;
            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);

            s->add("ac");
            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);

            // Strings that contain escaped braces.
            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
            if (U_FAILURE(ec)) break;
            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);

            // A string made of bracket characters.
            s->add("[]");
            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);

            // Strings given with \u escapes and with control-character escapes.
            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
            if (U_FAILURE(ec)) break;
            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);

            // j2189
            // Adding the same string twice must leave a single {abc} entry.
            s->clear();
            s->add(UnicodeString("abc", ""));
            s->add(UnicodeString("abc", ""));
            const char* exp6[] = {"abc", NOT, "ab", NULL};
            expectToPattern(*s, "[{abc}]", exp6);

            break;
        }

        // Reached with a failing ec only when one of the patterns above
        // could not be parsed.
        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
        delete s;
    }

    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    UnicodeSet s;
    s.add((UChar)97, (UChar)98); // 'a', 'b'
    expectToPattern(s, "[ab]", NULL);
}
    208 
    209 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    210 
    211     // use Integer.toString because Utility.hex doesn't handle ints
    212     UnicodeString pat = "";
    213     // TODO do these in hex
    214     //String source = "0x" + Integer.toString(start,16).toUpperCase();
    215     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
    216     UnicodeString source;
    217     source = source + (uint32_t)start;
    218     if (start != end)
    219         source = source + ".." + (uint32_t)end;
    220     UnicodeSet testSet;
    221     testSet.add(start, end);
    222     return checkPat(source, testSet);
    223 }
    224 
    225 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    226                                const UnicodeSet& testSet) {
    227     // What we want to make sure of is that a pattern generated
    228     // by toPattern(), with or without escaped unprintables, can
    229     // be passed back into the UnicodeSet constructor.
    230     UnicodeString pat0;
    231 
    232     testSet.toPattern(pat0, TRUE);
    233 
    234     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
    235 
    236     //String pat1 = unescapeLeniently(pat0);
    237     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
    238 
    239     UnicodeString pat2;
    240     testSet.toPattern(pat2, FALSE);
    241     if (!checkPat(source, testSet, pat2)) return FALSE;
    242 
    243     //String pat3 = unescapeLeniently(pat2);
    244     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
    245 
    246     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
    247     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    248     return TRUE;
    249 }
    250 
    251 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    252                                const UnicodeSet& testSet,
    253                                const UnicodeString& pat) {
    254     UErrorCode ec = U_ZERO_ERROR;
    255     UnicodeSet testSet2(pat, ec);
    256     if (testSet2 != testSet) {
    257         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
    258         return FALSE;
    259     }
    260     return TRUE;
    261 }
    262 
    263 void
    264 UnicodeSetTest::TestPatterns(void) {
    265     UnicodeSet set;
    266     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
    267     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
    268     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
    269     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
    270     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
    271     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
    272 
    273     // Throw in a test of complement
    274     set.complement();
    275     UnicodeString exp;
    276     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
    277     expectPairs(set, exp);
    278 }
    279 
    280 void
    281 UnicodeSetTest::TestCategories(void) {
    282     UErrorCode status = U_ZERO_ERROR;
    283     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
    284     UnicodeSet set(pat, status);
    285     if (U_FAILURE(status)) {
    286         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
    287         return;
    288     } else {
    289         expectContainment(set, pat, "ABC", "abc");
    290     }
    291 
    292     UChar32 i;
    293     int32_t failures = 0;
    294     // Make sure generation of L doesn't pollute cached Lu set
    295     // First generate L, then Lu
    296     set.applyPattern("[:L:]", status);
    297     if (U_FAILURE(status)) { errln("FAIL"); return; }
    298     for (i=0; i<0x200; ++i) {
    299         UBool l = u_isalpha((UChar)i);
    300         if (l != set.contains(i)) {
    301             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
    302                   set.contains(i));
    303             if (++failures == 10) break;
    304         }
    305     }
    306 
    307     set.applyPattern("[:Lu:]", status);
    308     if (U_FAILURE(status)) { errln("FAIL"); return; }
    309     for (i=0; i<0x200; ++i) {
    310         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
    311         if (lu != set.contains(i)) {
    312             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
    313                   set.contains(i));
    314             if (++failures == 20) break;
    315         }
    316     }
    317 }
    318 void
    319 UnicodeSetTest::TestCloneEqualHash(void) {
    320     UErrorCode status = U_ZERO_ERROR;
    321     // set1 and set2 used to be built with the obsolete constructor taking
    322     // UCharCategory values; replaced with pattern constructors
    323     // markus 20030502
    324     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
    325     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
    326     if (U_FAILURE(status)){
    327         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
    328         return;
    329     }
    330     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
    331     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
    332     if (U_FAILURE(status)){
    333         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
    334         return;
    335     }
    336 
    337     if (*set1 != *set1a) {
    338         errln("FAIL: category constructor for Ll broken");
    339     }
    340     if (*set2 != *set2a) {
    341         errln("FAIL: category constructor for Nd broken");
    342     }
    343     delete set1a;
    344     delete set2a;
    345 
    346     logln("Testing copy construction");
    347     UnicodeSet *set1copy=new UnicodeSet(*set1);
    348     if(*set1 != *set1copy || *set1 == *set2 ||
    349         getPairs(*set1) != getPairs(*set1copy) ||
    350         set1->hashCode() != set1copy->hashCode()){
    351         errln("FAIL : Error in copy construction");
    352         return;
    353     }
    354 
    355     logln("Testing =operator");
    356     UnicodeSet set1equal=*set1;
    357     UnicodeSet set2equal=*set2;
    358     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
    359         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
    360         errln("FAIL: Error in =operator");
    361     }
    362 
    363     logln("Testing clone()");
    364     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
    365     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
    366     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
    367         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
    368         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
    369         errln("FAIL: Error in clone");
    370     }
    371 
    372     logln("Testing hashcode");
    373     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
    374         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
    375         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
    376         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
    377         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
    378         errln("FAIL: Error in hashCode()");
    379     }
    380 
    381     delete set1;
    382     delete set1copy;
    383     delete set2;
    384     delete set1clone;
    385     delete set2clone;
    386 
    387 
    388 }
    389 void
    390 UnicodeSetTest::TestAddRemove(void) {
    391     UnicodeSet set; // Construct empty set
    392     doAssert(set.isEmpty() == TRUE, "set should be empty");
    393     doAssert(set.size() == 0, "size should be 0");
    394     set.complement();
    395     doAssert(set.size() == 0x110000, "size should be 0x110000");
    396     set.clear();
    397     set.add(0x0061, 0x007a);
    398     expectPairs(set, "az");
    399     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    400     doAssert(set.size() != 0, "size should not be equal to 0");
    401     doAssert(set.size() == 26, "size should be equal to 26");
    402     set.remove(0x006d, 0x0070);
    403     expectPairs(set, "alqz");
    404     doAssert(set.size() == 22, "size should be equal to 22");
    405     set.remove(0x0065, 0x0067);
    406     expectPairs(set, "adhlqz");
    407     doAssert(set.size() == 19, "size should be equal to 19");
    408     set.remove(0x0064, 0x0069);
    409     expectPairs(set, "acjlqz");
    410     doAssert(set.size() == 16, "size should be equal to 16");
    411     set.remove(0x0063, 0x0072);
    412     expectPairs(set, "absz");
    413     doAssert(set.size() == 10, "size should be equal to 10");
    414     set.add(0x0066, 0x0071);
    415     expectPairs(set, "abfqsz");
    416     doAssert(set.size() == 22, "size should be equal to 22");
    417     set.remove(0x0061, 0x0067);
    418     expectPairs(set, "hqsz");
    419     set.remove(0x0061, 0x007a);
    420     expectPairs(set, "");
    421     doAssert(set.isEmpty() == TRUE, "set should be empty");
    422     doAssert(set.size() == 0, "size should be 0");
    423     set.add(0x0061);
    424     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    425     doAssert(set.size() == 1, "size should not be equal to 1");
    426     set.add(0x0062);
    427     set.add(0x0063);
    428     expectPairs(set, "ac");
    429     doAssert(set.size() == 3, "size should not be equal to 3");
    430     set.add(0x0070);
    431     set.add(0x0071);
    432     expectPairs(set, "acpq");
    433     doAssert(set.size() == 5, "size should not be equal to 5");
    434     set.clear();
    435     expectPairs(set, "");
    436     doAssert(set.isEmpty() == TRUE, "set should be empty");
    437     doAssert(set.size() == 0, "size should be 0");
    438 
    439     // Try removing an entire set from another set
    440     expectPattern(set, "[c-x]", "cx");
    441     UnicodeSet set2;
    442     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
    443     set.removeAll(set2);
    444     expectPairs(set, "deluxx");
    445 
    446     // Try adding an entire set to another set
    447     expectPattern(set, "[jackiemclean]", "aacceein");
    448     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
    449     set.addAll(set2);
    450     expectPairs(set, "aacehort");
    451     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    452 
    453     // Try retaining an set of elements contained in another set (intersection)
    454     UnicodeSet set3;
    455     expectPattern(set3, "[a-c]", "ac");
    456     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
    457     set3.remove(0x0062);
    458     expectPairs(set3, "aacc");
    459     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    460     set.retainAll(set3);
    461     expectPairs(set, "aacc");
    462     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
    463     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    464     set.clear();
    465     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
    466 
    467     // Test commutativity
    468     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
    469     expectPattern(set2, "[jackiemclean]", "aacceein");
    470     set.addAll(set2);
    471     expectPairs(set, "aacehort");
    472     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    473 
    474 
    475 
    476 
    477 }
    478 
    479 /**
    480  * Make sure minimal representation is maintained.
    481  */
    482 void UnicodeSetTest::TestMinimalRep() {
    483     UErrorCode status = U_ZERO_ERROR;
    484     // This is pretty thoroughly tested by checkCanonicalRep()
    485     // run against the exhaustive operation results.  Use the code
    486     // here for debugging specific spot problems.
    487 
    488     // 1 overlap against 2
    489     UnicodeSet set("[h-km-q]", status);
    490     if (U_FAILURE(status)) { errln("FAIL"); return; }
    491     UnicodeSet set2("[i-o]", status);
    492     if (U_FAILURE(status)) { errln("FAIL"); return; }
    493     set.addAll(set2);
    494     expectPairs(set, "hq");
    495     // right
    496     set.applyPattern("[a-m]", status);
    497     if (U_FAILURE(status)) { errln("FAIL"); return; }
    498     set2.applyPattern("[e-o]", status);
    499     if (U_FAILURE(status)) { errln("FAIL"); return; }
    500     set.addAll(set2);
    501     expectPairs(set, "ao");
    502     // left
    503     set.applyPattern("[e-o]", status);
    504     if (U_FAILURE(status)) { errln("FAIL"); return; }
    505     set2.applyPattern("[a-m]", status);
    506     if (U_FAILURE(status)) { errln("FAIL"); return; }
    507     set.addAll(set2);
    508     expectPairs(set, "ao");
    509     // 1 overlap against 3
    510     set.applyPattern("[a-eg-mo-w]", status);
    511     if (U_FAILURE(status)) { errln("FAIL"); return; }
    512     set2.applyPattern("[d-q]", status);
    513     if (U_FAILURE(status)) { errln("FAIL"); return; }
    514     set.addAll(set2);
    515     expectPairs(set, "aw");
    516 }
    517 
/**
 * Broad API coverage test for UnicodeSet. In order, it exercises:
 * default construction, clear()/isEmpty(), size(), contains(first, last),
 * addAll/complement/complementAll on small ranges (ported InversionList
 * test), the setToBits()/bitsToSet() round trip, the string-based
 * complement/complementAll/removeAll/retainAll overloads, containsNone/
 * containsSome, retain(), serialize(), conversions to and from USet, and
 * the span()/spanBack() convenience overloads taking a UnicodeString.
 *
 * The sections mutate the shared locals `set` and `exp` step by step, so
 * their order matters.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default ct
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    // For every range [a, b] of the set: the exact range must be contained,
    // while the range widened by one code point on either side must not be.
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union: [3-10] + [7-15] == [3-15]
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of [3-15] == [0-2] + [16-MAX_VALUE]
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing again must give back [3-15].
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference: [3-10] xor [7-15] == [3-6] + [11-15]
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Converting c to bits and back must reproduce c.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    // complement(string) on an empty set: the set becomes [{ab}].
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element of the set is the string "ab".
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    // [{ab}a-z] with the characters of "alan" toggled: a, l and n drop out.
    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    // containsNone/containsSome against an overlapping set ([a-z]) and
    // against a disjoint one ([aln], the characters just removed).
    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // Same checks with the range overloads: a-z overlaps, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    // removeAll(string): removes the characters l, i and u.
    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    // retainAll(string): keeps only characters of "star" that are present;
    // the expected result no longer has the string {ab}.
    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    // retain(code point): only 's' (0x73) remains.
    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // serialize() of [s] is expected to write 3 units: 2, 0x73, 0x74.
    // NOTE(review): presumably a length word followed by the range
    // boundaries [0x73, 0x74) -- confirm against the serialize() docs.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Each conversion must hand back the very same address, for both the
    // non-const and the const overloads.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // The string is 10 'a', 10 'b', 10 'c'; the frozen set holds only 'a'
    // and 'c'. Start/limit indexes outside [0, 30] (negative, 35,
    // INT32_MAX) are expected to behave like the nearest valid index.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
    746 
    747 void UnicodeSetTest::TestIteration() {
    748     UErrorCode ec = U_ZERO_ERROR;
    749     int i = 0;
    750     int outerLoop;
    751 
    752     // 6 code points, 3 ranges, 2 strings, 8 total elements
    753     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
    754     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
    755     TEST_ASSERT_SUCCESS(ec);
    756     UnicodeSetIterator it(set);
    757 
    758     for (outerLoop=0; outerLoop<3; outerLoop++) {
    759         // Run the test multiple times, to check that iterator.reset() is working.
    760         for (i=0; i<10; i++) {
    761             UBool         nextv        = it.next();
    762             UBool         isString     = it.isString();
    763             int32_t       codePoint    = it.getCodepoint();
    764             //int32_t       codePointEnd = it.getCodepointEnd();
    765             UnicodeString s   = it.getString();
    766             switch (i) {
    767             case 0:
    768                 TEST_ASSERT(nextv == TRUE);
    769                 TEST_ASSERT(isString == FALSE);
    770                 TEST_ASSERT(codePoint==0x61);
    771                 TEST_ASSERT(s == "a");
    772                 break;
    773             case 1:
    774                 TEST_ASSERT(nextv == TRUE);
    775                 TEST_ASSERT(isString == FALSE);
    776                 TEST_ASSERT(codePoint==0x62);
    777                 TEST_ASSERT(s == "b");
    778                 break;
    779             case 2:
    780                 TEST_ASSERT(nextv == TRUE);
    781                 TEST_ASSERT(isString == FALSE);
    782                 TEST_ASSERT(codePoint==0x63);
    783                 TEST_ASSERT(s == "c");
    784                 break;
    785             case 3:
    786                 TEST_ASSERT(nextv == TRUE);
    787                 TEST_ASSERT(isString == FALSE);
    788                 TEST_ASSERT(codePoint==0x79);
    789                 TEST_ASSERT(s == "y");
    790                 break;
    791             case 4:
    792                 TEST_ASSERT(nextv == TRUE);
    793                 TEST_ASSERT(isString == FALSE);
    794                 TEST_ASSERT(codePoint==0x7a);
    795                 TEST_ASSERT(s == "z");
    796                 break;
    797             case 5:
    798                 TEST_ASSERT(nextv == TRUE);
    799                 TEST_ASSERT(isString == FALSE);
    800                 TEST_ASSERT(codePoint==0x1abcd);
    801                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
    802                 break;
    803             case 6:
    804                 TEST_ASSERT(nextv == TRUE);
    805                 TEST_ASSERT(isString == TRUE);
    806                 TEST_ASSERT(s == "str1");
    807                 break;
    808             case 7:
    809                 TEST_ASSERT(nextv == TRUE);
    810                 TEST_ASSERT(isString == TRUE);
    811                 TEST_ASSERT(s == "str2");
    812                 break;
    813             case 8:
    814                 TEST_ASSERT(nextv == FALSE);
    815                 break;
    816             case 9:
    817                 TEST_ASSERT(nextv == FALSE);
    818                 break;
    819             }
    820         }
    821         it.reset();  // prepare to run the iteration again.
    822     }
    823 }
    824 
    825 
    826 
    827 
    828 void UnicodeSetTest::TestStrings() {
    829     UErrorCode ec = U_ZERO_ERROR;
    830 
    831     UnicodeSet* testList[] = {
    832         UnicodeSet::createFromAll("abc"),
    833         new UnicodeSet("[a-c]", ec),
    834 
    835         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
    836         new UnicodeSet("[{ll}{ch}a-z]", ec),
    837 
    838         UnicodeSet::createFrom("ab}c"),
    839         new UnicodeSet("[{ab\\}c}]", ec),
    840 
    841         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
    842         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
    843 
    844         NULL
    845     };
    846 
    847     if (U_FAILURE(ec)) {
    848         errln("FAIL: couldn't construct test sets");
    849     }
    850 
    851     for (int32_t i = 0; testList[i] != NULL; i+=2) {
    852         if (U_SUCCESS(ec)) {
    853             UnicodeString pat0, pat1;
    854             testList[i]->toPattern(pat0, TRUE);
    855             testList[i+1]->toPattern(pat1, TRUE);
    856             if (*testList[i] == *testList[i+1]) {
    857                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
    858             } else {
    859                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
    860             }
    861         }
    862         delete testList[i];
    863         delete testList[i+1];
    864     }
    865 }
    866 
    867 /**
    868  * Test the [:Latin:] syntax.
    869  */
    870 void UnicodeSetTest::TestScriptSet() {
    871     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
    872 
    873     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
    874 
    875     /* Jitterbug 1423 */
    876     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
    877 
    878 }
    879 
    880 /**
    881  * Test the [:Latin:] syntax.
    882  */
    883 void UnicodeSetTest::TestPropertySet() {
    884     static const char* const DATA[] = {
    885         // Pattern, Chars IN, Chars NOT in
    886 
    887         "[:Latin:]",
    888         "aA",
    889         "\\u0391\\u03B1",
    890 
    891         "[\\p{Greek}]",
    892         "\\u0391\\u03B1",
    893         "aA",
    894 
    895         "\\P{ GENERAL Category = upper case letter }",
    896         "abc",
    897         "ABC",
    898 
    899 #if !UCONFIG_NO_NORMALIZATION
    900         // Combining class: @since ICU 2.2
    901         // Check both symbolic and numeric
    902         "\\p{ccc=Nukta}",
    903         "\\u0ABC",
    904         "abc",
    905 
    906         "\\p{Canonical Combining Class = 11}",
    907         "\\u05B1",
    908         "\\u05B2",
    909 
    910         "[:c c c = iota subscript :]",
    911         "\\u0345",
    912         "xyz",
    913 #endif
    914 
    915         // Bidi class: @since ICU 2.2
    916         "\\p{bidiclass=lefttoright}",
    917         "abc",
    918         "\\u0671\\u0672",
    919 
    920         // Binary properties: @since ICU 2.2
    921         "\\p{ideographic}",
    922         "\\u4E0A",
    923         "x",
    924 
    925         "[:math=false:]",
    926         "q)*(",
    927         // weiv: )(and * were removed from math in Unicode 4.0.1
    928         //"(*+)",
    929         "+<>^",
    930 
    931         // JB#1767 \N{}, \p{ASCII}
    932         "[:Ascii:]",
    933         "abc\\u0000\\u007F",
    934         "\\u0080\\u4E00",
    935 
    936         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
    937         "az",
    938         "qrs",
    939 
    940         // JB#2015
    941         "[:any:]",
    942         "a\\U0010FFFF",
    943         "",
    944 
    945         "[:nv=0.5:]",
    946         "\\u00BD\\u0F2A",
    947         "\\u00BC",
    948 
    949         // JB#2653: Age
    950         "[:Age=1.1:]",
    951         "\\u03D6", // 1.1
    952         "\\u03D8\\u03D9", // 3.2
    953 
    954         "[:Age=3.1:]",
    955         "\\u1800\\u3400\\U0002f800",
    956         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
    957 
    958         // JB#2350: Case_Sensitive
    959         "[:Case Sensitive:]",
    960         "A\\u1FFC\\U00010410",
    961         ";\\u00B4\\U00010500",
    962 
    963         // JB#2832: C99-compatibility props
    964         "[:blank:]",
    965         " \\u0009",
    966         "1-9A-Z",
    967 
    968         "[:graph:]",
    969         "19AZ",
    970         " \\u0003\\u0007\\u0009\\u000A\\u000D",
    971 
    972         "[:punct:]",
    973         "!@#%&*()[]{}-_\\/;:,.?'\"",
    974         "09azAZ",
    975 
    976         "[:xdigit:]",
    977         "09afAF",
    978         "gG!",
    979 
    980         // Regex compatibility test
    981         "[-b]", // leading '-' is literal
    982         "-b",
    983         "ac",
    984 
    985         "[^-b]", // leading '-' is literal
    986         "ac",
    987         "-b",
    988 
    989         "[b-]", // trailing '-' is literal
    990         "-b",
    991         "ac",
    992 
    993         "[^b-]", // trailing '-' is literal
    994         "ac",
    995         "-b",
    996 
    997         "[a-b-]", // trailing '-' is literal
    998         "ab-",
    999         "c=",
   1000 
   1001         "[[a-q]&[p-z]-]", // trailing '-' is literal
   1002         "pq-",
   1003         "or=",
   1004 
   1005         "[\\s|\\)|:|$|\\>]", // from regex tests
   1006         "s|):$>",
   1007         "abc",
   1008 
   1009         "[\\uDC00cd]", // JB#2906: isolated trail at start
   1010         "cd\\uDC00",
   1011         "ab\\uD800\\U00010000",
   1012 
   1013         "[ab\\uD800]", // JB#2906: isolated trail at start
   1014         "ab\\uD800",
   1015         "cd\\uDC00\\U00010000",
   1016 
   1017         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
   1018         "abcd\\uD800",
   1019         "ef\\uDC00\\U00010000",
   1020 
   1021         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
   1022         "abcd\\uDC00",
   1023         "ef\\uD800\\U00010000",
   1024 
   1025 #if !UCONFIG_NO_NORMALIZATION
   1026         "[:^lccc=0:]", // Lead canonical class
   1027         "\\u0300\\u0301",
   1028         "abcd\\u00c0\\u00c5",
   1029 
   1030         "[:^tccc=0:]", // Trail canonical class
   1031         "\\u0300\\u0301\\u00c0\\u00c5",
   1032         "abcd",
   1033 
   1034         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
   1035         "\\u0300\\u0301\\u00c0\\u00c5",
   1036         "abcd",
   1037 
   1038         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
   1039         "",
   1040         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1041 
   1042         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
   1043         "\\u0F73\\u0F75\\u0F81",
   1044         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1045 #endif /* !UCONFIG_NO_NORMALIZATION */
   1046 
   1047         "[:Assigned:]",
   1048         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
   1049         "\\u0888\\uFDD3\\uFFFE\\U00050005",
   1050 
   1051         // Script_Extensions, new in Unicode 6.0
   1052         "[:scx=Arab:]",
   1053         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
   1054         "\\u061D\\uFDEF\\uFDFE",
   1055 
   1056         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
   1057         // so scx-sc is missing U+FDF2.
   1058         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
   1059         "\\u0640\\u064B\\u0650\\u0655",
   1060         "\\uFDF2"
   1061     };
   1062 
   1063     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
   1064 
   1065     for (int32_t i=0; i<DATA_LEN; i+=3) {
   1066         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
   1067                           CharsToUnicodeString(DATA[i+2]));
   1068     }
   1069 }
   1070 
   1071 /**
   1072   * Test that Posix style character classes [:digit:], etc.
   1073   *   have the Unicode definitions from TR 18.
   1074   */
   1075 void UnicodeSetTest::TestPosixClasses() {
   1076     {
   1077         UErrorCode status = U_ZERO_ERROR;
   1078         UnicodeSet s1("[:alpha:]", status);
   1079         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
   1080         TEST_ASSERT_SUCCESS(status);
   1081         TEST_ASSERT(s1==s2);
   1082     }
   1083     {
   1084         UErrorCode status = U_ZERO_ERROR;
   1085         UnicodeSet s1("[:lower:]", status);
   1086         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
   1087         TEST_ASSERT_SUCCESS(status);
   1088         TEST_ASSERT(s1==s2);
   1089     }
   1090     {
   1091         UErrorCode status = U_ZERO_ERROR;
   1092         UnicodeSet s1("[:upper:]", status);
   1093         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
   1094         TEST_ASSERT_SUCCESS(status);
   1095         TEST_ASSERT(s1==s2);
   1096     }
   1097     {
   1098         UErrorCode status = U_ZERO_ERROR;
   1099         UnicodeSet s1("[:punct:]", status);
   1100         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
   1101         TEST_ASSERT_SUCCESS(status);
   1102         TEST_ASSERT(s1==s2);
   1103     }
   1104     {
   1105         UErrorCode status = U_ZERO_ERROR;
   1106         UnicodeSet s1("[:digit:]", status);
   1107         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
   1108         TEST_ASSERT_SUCCESS(status);
   1109         TEST_ASSERT(s1==s2);
   1110     }
   1111     {
   1112         UErrorCode status = U_ZERO_ERROR;
   1113         UnicodeSet s1("[:xdigit:]", status);
   1114         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
   1115         TEST_ASSERT_SUCCESS(status);
   1116         TEST_ASSERT(s1==s2);
   1117     }
   1118     {
   1119         UErrorCode status = U_ZERO_ERROR;
   1120         UnicodeSet s1("[:alnum:]", status);
   1121         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
   1122         TEST_ASSERT_SUCCESS(status);
   1123         TEST_ASSERT(s1==s2);
   1124     }
   1125     {
   1126         UErrorCode status = U_ZERO_ERROR;
   1127         UnicodeSet s1("[:space:]", status);
   1128         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
   1129         TEST_ASSERT_SUCCESS(status);
   1130         TEST_ASSERT(s1==s2);
   1131     }
   1132     {
   1133         UErrorCode status = U_ZERO_ERROR;
   1134         UnicodeSet s1("[:blank:]", status);
   1135         TEST_ASSERT_SUCCESS(status);
   1136         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
   1137             status);
   1138         TEST_ASSERT_SUCCESS(status);
   1139         TEST_ASSERT(s1==s2);
   1140     }
   1141     {
   1142         UErrorCode status = U_ZERO_ERROR;
   1143         UnicodeSet s1("[:cntrl:]", status);
   1144         TEST_ASSERT_SUCCESS(status);
   1145         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
   1146         TEST_ASSERT_SUCCESS(status);
   1147         TEST_ASSERT(s1==s2);
   1148     }
   1149     {
   1150         UErrorCode status = U_ZERO_ERROR;
   1151         UnicodeSet s1("[:graph:]", status);
   1152         TEST_ASSERT_SUCCESS(status);
   1153         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
   1154         TEST_ASSERT_SUCCESS(status);
   1155         TEST_ASSERT(s1==s2);
   1156     }
   1157     {
   1158         UErrorCode status = U_ZERO_ERROR;
   1159         UnicodeSet s1("[:print:]", status);
   1160         TEST_ASSERT_SUCCESS(status);
   1161         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
   1162         TEST_ASSERT_SUCCESS(status);
   1163         TEST_ASSERT(s1==s2);
   1164     }
   1165 }
   1166 /**
   1167  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
   1168  */
   1169 void UnicodeSetTest::TestClone() {
   1170     UErrorCode ec = U_ZERO_ERROR;
   1171     UnicodeSet s("[abcxyz]", ec);
   1172     UnicodeSet t(s);
   1173     expectContainment(t, "abc", "def");
   1174 }
   1175 
   1176 /**
   1177  * Test the indexOf() and charAt() methods.
   1178  */
   1179 void UnicodeSetTest::TestIndexOf() {
   1180     UErrorCode ec = U_ZERO_ERROR;
   1181     UnicodeSet set("[a-cx-y3578]", ec);
   1182     if (U_FAILURE(ec)) {
   1183         errln("FAIL: UnicodeSet constructor");
   1184         return;
   1185     }
   1186     for (int32_t i=0; i<set.size(); ++i) {
   1187         UChar32 c = set.charAt(i);
   1188         if (set.indexOf(c) != i) {
   1189             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
   1190                 i, c, set.indexOf(c));
   1191         }
   1192     }
   1193     UChar32 c = set.charAt(set.size());
   1194     if (c != -1) {
   1195         errln("FAIL: charAt(<out of range>) = %X", c);
   1196     }
   1197     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
   1198     if (j != -1) {
   1199         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
   1200     }
   1201 }
   1202 
   1203 /**
   1204  * Test closure API.
   1205  */
   1206 void UnicodeSetTest::TestCloseOver() {
   1207     UErrorCode ec = U_ZERO_ERROR;
   1208 
   1209     char CASE[] = {(char)USET_CASE_INSENSITIVE};
   1210     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
   1211     const char* DATA[] = {
   1212         // selector, input, output
   1213         CASE,
   1214         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1215         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
   1216 
   1217         CASE,
   1218         "[\\u01F1]", // 'DZ'
   1219         "[\\u01F1\\u01F2\\u01F3]",
   1220 
   1221         CASE,
   1222         "[\\u1FB4]",
   1223         "[\\u1FB4{\\u03AC\\u03B9}]",
   1224 
   1225         CASE,
   1226         "[{F\\uFB01}]",
   1227         "[\\uFB03{ffi}]",
   1228 
   1229         CASE, // make sure binary search finds limits
   1230         "[a\\uFF3A]",
   1231         "[aA\\uFF3A\\uFF5A]",
   1232 
   1233         CASE,
   1234         "[a-z]","[A-Za-z\\u017F\\u212A]",
   1235         CASE,
   1236         "[abc]","[A-Ca-c]",
   1237         CASE,
   1238         "[ABC]","[A-Ca-c]",
   1239 
   1240         CASE, "[i]", "[iI]",
   1241 
   1242         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
   1243         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
   1244 
   1245         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
   1246 
   1247         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
   1248 
   1249         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
   1250 
   1251         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
   1252 
   1253         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
   1254 
   1255         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
   1256 
   1257         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
   1258         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
   1259 
   1260         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
   1261 
   1262         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
   1263 
   1264         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
   1265 
   1266 #if !UCONFIG_NO_FILE_IO
   1267         CASE_MAPPINGS,
   1268         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1269         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
   1270 #endif
   1271 
   1272         CASE_MAPPINGS,
   1273         "[\\u01F1]", // 'DZ'
   1274         "[\\u01F1\\u01F2\\u01F3]",
   1275 
   1276         CASE_MAPPINGS,
   1277         "[a-z]",
   1278         "[A-Za-z]",
   1279 
   1280         NULL
   1281     };
   1282 
   1283     UnicodeSet s;
   1284     UnicodeSet t;
   1285     UnicodeString buf;
   1286     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
   1287         int32_t selector = DATA[i][0];
   1288         UnicodeString pat(DATA[i+1], -1, US_INV);
   1289         UnicodeString exp(DATA[i+2], -1, US_INV);
   1290         s.applyPattern(pat, ec);
   1291         s.closeOver(selector);
   1292         t.applyPattern(exp, ec);
   1293         if (U_FAILURE(ec)) {
   1294             errln("FAIL: applyPattern failed");
   1295             continue;
   1296         }
   1297         if (s == t) {
   1298             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
   1299         } else {
   1300             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
   1301                   s.toPattern(buf, TRUE) + ", expected " + exp);
   1302         }
   1303     }
   1304 
   1305 #if 0
   1306     /*
   1307      * Unused test code.
   1308      * This was used to compare the old implementation (using USET_CASE)
   1309      * with the new one (using 0x100 temporarily)
   1310      * while transitioning from hardcoded case closure tables in uniset.cpp
   1311      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
   1312      * and using ucase.c functions for closure.
   1313      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
   1314      *
   1315      * Note: The old and new implementation never fully matched because
   1316      * the old implementation turned out to not map U+0130 and U+0131 correctly
   1317      * (dotted I and dotless i) and because the old implementation's data tables
   1318      * were outdated compared to Unicode 4.0.1 at the time of the change to the
   1319      * new implementation. (So sigmas and some other characters were not handled
   1320      * according to the newer Unicode version.)
   1321      */
   1322     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
   1323     UnicodeSetIterator si(sens);
   1324     UnicodeString str, buf2;
   1325     const UnicodeString *pStr;
   1326     UChar32 c;
   1327     while(si.next()) {
   1328         if(!si.isString()) {
   1329             c=si.getCodepoint();
   1330             s.clear();
   1331             s.add(c);
   1332 
   1333             str.setTo(c);
   1334             str.foldCase();
   1335             sens2.add(str);
   1336 
   1337             t=s;
   1338             s.closeOver(USET_CASE);
   1339             t.closeOver(0x100);
   1340             if(s!=t) {
   1341                 errln("FAIL: closeOver(U+%04x) differs: ", c);
   1342                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1343             }
   1344         }
   1345     }
   1346     // remove all code points
   1347     // should contain all full case folding mapping strings
   1348     sens2.remove(0, 0x10ffff);
   1349     si.reset(sens2);
   1350     while(si.next()) {
   1351         if(si.isString()) {
   1352             pStr=&si.getString();
   1353             s.clear();
   1354             s.add(*pStr);
   1355             t=s2=s;
   1356             s.closeOver(USET_CASE);
   1357             t.closeOver(0x100);
   1358             if(s!=t) {
   1359                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
   1360                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1361             }
   1362         }
   1363     }
   1364 #endif
   1365 
   1366     // Test the pattern API
   1367     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1368     if (U_FAILURE(ec)) {
   1369         errln("FAIL: applyPattern failed");
   1370     } else {
   1371         expectContainment(s, "abcABC", "defDEF");
   1372     }
   1373     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1374     if (U_FAILURE(ec)) {
   1375         errln("FAIL: constructor failed");
   1376     } else {
   1377         expectContainment(v, "defDEF", "abcABC");
   1378     }
   1379     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
   1380     if (U_FAILURE(ec)) {
   1381         errln("FAIL: construct w/case mappings failed");
   1382     } else {
   1383         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
   1384     }
   1385 }
   1386 
   1387 void UnicodeSetTest::TestEscapePattern() {
   1388     const char pattern[] =
   1389         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
   1390     const char exp[] =
   1391         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
   1392     // We test this with two passes; in the second pass we
   1393     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
   1394     // this fails -- which is what we expect.
   1395     for (int32_t pass=1; pass<=2; ++pass) {
   1396         UErrorCode ec = U_ZERO_ERROR;
   1397         UnicodeString pat(pattern, -1, US_INV);
   1398         if (pass==2) {
   1399             pat = pat.unescape();
   1400         }
   1401         // Pattern is only good for pass 1
   1402         UBool isPatternValid = (pass==1);
   1403 
   1404         UnicodeSet set(pat, ec);
   1405         if (U_SUCCESS(ec) != isPatternValid){
   1406             errln((UnicodeString)"FAIL: applyPattern(" +
   1407                   escape(pat) + ") => " +
   1408                   u_errorName(ec));
   1409             continue;
   1410         }
   1411         if (U_FAILURE(ec)) {
   1412             continue;
   1413         }
   1414         if (set.contains((UChar)0x0644)){
   1415             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
   1416         }
   1417 
   1418         UnicodeString newpat;
   1419         set.toPattern(newpat, TRUE);
   1420         if (newpat == UnicodeString(exp, -1, US_INV)) {
   1421             logln(escape(pat) + " => " + newpat);
   1422         } else {
   1423             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
   1424         }
   1425 
   1426         for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1427             UnicodeString str("Range ");
   1428             str.append((UChar)(0x30 + i))
   1429                 .append(": ")
   1430                 .append((UChar32)set.getRangeStart(i))
   1431                 .append(" - ")
   1432                 .append((UChar32)set.getRangeEnd(i));
   1433             str = str + " (" + set.getRangeStart(i) + " - " +
   1434                 set.getRangeEnd(i) + ")";
   1435             if (set.getRangeStart(i) < 0) {
   1436                 errln((UnicodeString)"FAIL: " + escape(str));
   1437             } else {
   1438                 logln(escape(str));
   1439             }
   1440         }
   1441     }
   1442 }
   1443 
   1444 void UnicodeSetTest::expectRange(const UnicodeString& label,
   1445                                  const UnicodeSet& set,
   1446                                  UChar32 start, UChar32 end) {
   1447     UnicodeSet exp(start, end);
   1448     UnicodeString pat;
   1449     if (set == exp) {
   1450         logln(label + " => " + set.toPattern(pat, TRUE));
   1451     } else {
   1452         UnicodeString xpat;
   1453         errln((UnicodeString)"FAIL: " + label + " => " +
   1454               set.toPattern(pat, TRUE) +
   1455               ", expected " + exp.toPattern(xpat, TRUE));
   1456     }
   1457 }
   1458 
   1459 void UnicodeSetTest::TestInvalidCodePoint() {
   1460 
   1461     const UChar32 DATA[] = {
   1462         // Test range             Expected range
   1463         0, 0x10FFFF,              0, 0x10FFFF,
   1464         (UChar32)-1, 8,           0, 8,
   1465         8, 0x110000,              8, 0x10FFFF
   1466     };
   1467     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
   1468 
   1469     UnicodeString pat;
   1470     int32_t i;
   1471 
   1472     for (i=0; i<DATA_LENGTH; i+=4) {
   1473         UChar32 start  = DATA[i];
   1474         UChar32 end    = DATA[i+1];
   1475         UChar32 xstart = DATA[i+2];
   1476         UChar32 xend   = DATA[i+3];
   1477 
   1478         // Try various API using the test code points
   1479 
   1480         UnicodeSet set(start, end);
   1481         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
   1482                     set, xstart, xend);
   1483 
   1484         set.clear();
   1485         set.set(start, end);
   1486         expectRange((UnicodeString)"set(" + start + "," + end + ")",
   1487                     set, xstart, xend);
   1488 
   1489         UBool b = set.contains(start);
   1490         b = set.contains(start, end);
   1491         b = set.containsNone(start, end);
   1492         b = set.containsSome(start, end);
   1493         (void)b;   // Suppress set but not used warning.
   1494 
   1495         /*int32_t index = set.indexOf(start);*/
   1496 
   1497         set.clear();
   1498         set.add(start);
   1499         set.add(start, end);
   1500         expectRange((UnicodeString)"add(" + start + "," + end + ")",
   1501                     set, xstart, xend);
   1502 
   1503         set.set(0, 0x10FFFF);
   1504         set.retain(start, end);
   1505         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
   1506                     set, xstart, xend);
   1507         set.retain(start);
   1508 
   1509         set.set(0, 0x10FFFF);
   1510         set.remove(start);
   1511         set.remove(start, end);
   1512         set.complement();
   1513         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
   1514                     set, xstart, xend);
   1515 
   1516         set.set(0, 0x10FFFF);
   1517         set.complement(start, end);
   1518         set.complement();
   1519         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
   1520                     set, xstart, xend);
   1521         set.complement(start);
   1522     }
   1523 
   1524     const UChar32 DATA2[] = {
   1525         0,
   1526         0x10FFFF,
   1527         (UChar32)-1,
   1528         0x110000
   1529     };
   1530     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
   1531 
   1532     for (i=0; i<DATA2_LENGTH; ++i) {
   1533         UChar32 c = DATA2[i], end = 0x10FFFF;
   1534         UBool valid = (c >= 0 && c <= 0x10FFFF);
   1535 
   1536         UnicodeSet set(0, 0x10FFFF);
   1537 
   1538         // For single-codepoint contains, invalid codepoints are NOT contained
   1539         UBool b = set.contains(c);
   1540         if (b == valid) {
   1541             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
   1542                   ") = " + b);
   1543         } else {
   1544             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
   1545                   ") = " + b);
   1546         }
   1547 
   1548         // For codepoint range contains, containsNone, and containsSome,
   1549         // invalid or empty (start > end) ranges have UNDEFINED behavior.
   1550         b = set.contains(c, end);
   1551         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
   1552               "," + end + ") = " + b);
   1553 
   1554         b = set.containsNone(c, end);
   1555         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
   1556               "," + end + ") = " + b);
   1557 
   1558         b = set.containsSome(c, end);
   1559         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
   1560               "," + end + ") = " + b);
   1561 
   1562         int32_t index = set.indexOf(c);
   1563         if ((index >= 0) == valid) {
   1564             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
   1565                   ") = " + index);
   1566         } else {
   1567             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
   1568                   ") = " + index);
   1569         }
   1570     }
   1571 }
   1572 
   1573 // Used by TestSymbolTable
   1574 class TokenSymbolTable : public SymbolTable {
   1575 public:
   1576     Hashtable contents;
   1577 
   1578     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
   1579         contents.setValueDeleter(uprv_deleteUObject);
   1580     }
   1581 
   1582     ~TokenSymbolTable() {}
   1583 
   1584     /**
   1585      * (Non-SymbolTable API) Add the given variable and value to
   1586      * the table.  Variable should NOT contain leading '$'.
   1587      */
   1588     void add(const UnicodeString& var, const UnicodeString& value,
   1589              UErrorCode& ec) {
   1590         if (U_SUCCESS(ec)) {
   1591             contents.put(var, new UnicodeString(value), ec);
   1592         }
   1593     }
   1594 
   1595     /**
   1596      * SymbolTable API
   1597      */
   1598     virtual const UnicodeString* lookup(const UnicodeString& s) const {
   1599         return (const UnicodeString*) contents.get(s);
   1600     }
   1601 
   1602     /**
   1603      * SymbolTable API
   1604      */
   1605     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
   1606         return NULL;
   1607     }
   1608 
   1609     /**
   1610      * SymbolTable API
   1611      */
   1612     virtual UnicodeString parseReference(const UnicodeString& text,
   1613                                          ParsePosition& pos, int32_t limit) const {
   1614         int32_t start = pos.getIndex();
   1615         int32_t i = start;
   1616         UnicodeString result;
   1617         while (i < limit) {
   1618             UChar c = text.charAt(i);
   1619             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
   1620                 break;
   1621             }
   1622             ++i;
   1623         }
   1624         if (i == start) { // No valid name chars
   1625             return result; // Indicate failure with empty string
   1626         }
   1627         pos.setIndex(i);
   1628         text.extractBetween(start, i, result);
   1629         return result;
   1630     }
   1631 };
   1632 
   1633 void UnicodeSetTest::TestSymbolTable() {
   1634     // Multiple test cases can be set up here.  Each test case
   1635     // is terminated by null:
   1636     // var, value, var, value,..., input pat., exp. output pat., null
   1637     const char* DATA[] = {
   1638         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
   1639         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
   1640         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
   1641         NULL
   1642     };
   1643 
   1644     for (int32_t i=0; DATA[i]!=NULL; ++i) {
   1645         UErrorCode ec = U_ZERO_ERROR;
   1646         TokenSymbolTable sym(ec);
   1647         if (U_FAILURE(ec)) {
   1648             errln("FAIL: couldn't construct TokenSymbolTable");
   1649             continue;
   1650         }
   1651 
   1652         // Set up variables
   1653         while (DATA[i+2] != NULL) {
   1654             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
   1655             if (U_FAILURE(ec)) {
   1656                 errln("FAIL: couldn't add to TokenSymbolTable");
   1657                 continue;
   1658             }
   1659             i += 2;
   1660         }
   1661 
   1662         // Input pattern and expected output pattern
   1663         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
   1664         i += 2;
   1665 
   1666         ParsePosition pos(0);
   1667         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
   1668         if (U_FAILURE(ec)) {
   1669             errln("FAIL: couldn't construct UnicodeSet");
   1670             continue;
   1671         }
   1672 
   1673         // results
   1674         if (pos.getIndex() != inpat.length()) {
   1675             errln((UnicodeString)"Failed to read to end of string \""
   1676                   + inpat + "\": read to "
   1677                   + pos.getIndex() + ", length is "
   1678                   + inpat.length());
   1679         }
   1680 
   1681         UnicodeSet us2(exppat, ec);
   1682         if (U_FAILURE(ec)) {
   1683             errln("FAIL: couldn't construct expected UnicodeSet");
   1684             continue;
   1685         }
   1686 
   1687         UnicodeString a, b;
   1688         if (us != us2) {
   1689             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
   1690                   ", expected " + us2.toPattern(b, TRUE));
   1691         } else {
   1692             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
   1693         }
   1694     }
   1695 }
   1696 
   1697 void UnicodeSetTest::TestSurrogate() {
   1698     const char* DATA[] = {
   1699         // These should all behave identically
   1700         "[abc\\uD800\\uDC00]",
   1701         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
   1702         "[abc\\U00010000]",
   1703         0
   1704     };
   1705     for (int i=0; DATA[i] != 0; ++i) {
   1706         UErrorCode ec = U_ZERO_ERROR;
   1707         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
   1708         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
   1709         UnicodeSet set(str, ec);
   1710         if (U_FAILURE(ec)) {
   1711             errln("FAIL: UnicodeSet constructor");
   1712             continue;
   1713         }
   1714         expectContainment(set,
   1715                           CharsToUnicodeString("abc\\U00010000"),
   1716                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
   1717         if (set.size() != 4) {
   1718             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
   1719                   set.size() + ", expected 4");
   1720         }
   1721 
   1722         {
   1723           UErrorCode subErr = U_ZERO_ERROR;
   1724           checkRoundTrip(set);
   1725           checkSerializeRoundTrip(set, subErr);
   1726         }
   1727     }
   1728 }
   1729 
   1730 void UnicodeSetTest::TestExhaustive() {
   1731     // exhaustive tests. Simulate UnicodeSets with integers.
   1732     // That gives us very solid tests (except for large memory tests).
   1733 
   1734     int32_t limit = 128;
   1735 
   1736     UnicodeSet x, y, z, aa;
   1737 
   1738     for (int32_t i = 0; i < limit; ++i) {
   1739         bitsToSet(i, x);
   1740         logln((UnicodeString)"Testing " + i + ", " + x);
   1741         _testComplement(i, x, y);
   1742 
   1743         UnicodeSet &toTest = bitsToSet(i, aa);
   1744 
   1745         // AS LONG AS WE ARE HERE, check roundtrip
   1746         checkRoundTrip(toTest);
   1747         UErrorCode ec = U_ZERO_ERROR;
   1748         checkSerializeRoundTrip(toTest, ec);
   1749 
   1750         for (int32_t j = 0; j < limit; ++j) {
   1751             _testAdd(i,j,  x,y,z);
   1752             _testXor(i,j,  x,y,z);
   1753             _testRetain(i,j,  x,y,z);
   1754             _testRemove(i,j,  x,y,z);
   1755         }
   1756     }
   1757 }
   1758 
   1759 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
   1760     bitsToSet(a, x);
   1761     z = x;
   1762     z.complement();
   1763     int32_t c = setToBits(z);
   1764     if (c != (~a)) {
   1765         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
   1766         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
   1767     }
   1768     checkCanonicalRep(z, (UnicodeString)"complement " + a);
   1769 }
   1770 
   1771 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1772     bitsToSet(a, x);
   1773     bitsToSet(b, y);
   1774     z = x;
   1775     z.addAll(y);
   1776     int32_t c = setToBits(z);
   1777     if (c != (a | b)) {
   1778         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
   1779         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
   1780     }
   1781     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
   1782 }
   1783 
   1784 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1785     bitsToSet(a, x);
   1786     bitsToSet(b, y);
   1787     z = x;
   1788     z.retainAll(y);
   1789     int32_t c = setToBits(z);
   1790     if (c != (a & b)) {
   1791         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
   1792         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
   1793     }
   1794     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
   1795 }
   1796 
   1797 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1798     bitsToSet(a, x);
   1799     bitsToSet(b, y);
   1800     z = x;
   1801     z.removeAll(y);
   1802     int32_t c = setToBits(z);
   1803     if (c != (a &~ b)) {
   1804         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
   1805         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
   1806     }
   1807     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
   1808 }
   1809 
   1810 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1811     bitsToSet(a, x);
   1812     bitsToSet(b, y);
   1813     z = x;
   1814     z.complementAll(y);
   1815     int32_t c = setToBits(z);
   1816     if (c != (a ^ b)) {
   1817         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
   1818         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
   1819     }
   1820     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
   1821 }
   1822 
   1823 /**
   1824  * Check that ranges are monotonically increasing and non-
   1825  * overlapping.
   1826  */
   1827 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
   1828     int32_t n = set.getRangeCount();
   1829     if (n < 0) {
   1830         errln((UnicodeString)"FAIL result of " + msg +
   1831               ": range count should be >= 0 but is " +
   1832               n /*+ " for " + set.toPattern())*/);
   1833         return;
   1834     }
   1835     UChar32 last = 0;
   1836     for (int32_t i=0; i<n; ++i) {
   1837         UChar32 start = set.getRangeStart(i);
   1838         UChar32 end = set.getRangeEnd(i);
   1839         if (start > end) {
   1840             errln((UnicodeString)"FAIL result of " + msg +
   1841                   ": range " + (i+1) +
   1842                   " start > end: " + (int)start + ", " + (int)end +
   1843                   " for " + set);
   1844         }
   1845         if (i > 0 && start <= last) {
   1846             errln((UnicodeString)"FAIL result of " + msg +
   1847                   ": range " + (i+1) +
   1848                   " overlaps previous range: " + (int)start + ", " + (int)end +
   1849                   " for " + set);
   1850         }
   1851         last = end;
   1852     }
   1853 }
   1854 
   1855 /**
   1856  * Convert a bitmask to a UnicodeSet.
   1857  */
   1858 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
   1859     result.clear();
   1860     for (UChar32 i = 0; i < 32; ++i) {
   1861         if ((a & (1<<i)) != 0) {
   1862             result.add(i);
   1863         }
   1864     }
   1865     return result;
   1866 }
   1867 
   1868 /**
   1869  * Convert a UnicodeSet to a bitmask.  Only the characters
   1870  * U+0000 to U+0020 are represented in the bitmask.
   1871  */
   1872 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
   1873     int32_t result = 0;
   1874     for (int32_t i = 0; i < 32; ++i) {
   1875         if (x.contains((UChar32)i)) {
   1876             result |= (1<<i);
   1877         }
   1878     }
   1879     return result;
   1880 }
   1881 
   1882 /**
   1883  * Return the representation of an inversion list based UnicodeSet
   1884  * as a pairs list.  Ranges are listed in ascending Unicode order.
   1885  * For example, the set [a-zA-M3] is represented as "33AMaz".
   1886  */
   1887 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
   1888     UnicodeString pairs;
   1889     for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1890         UChar32 start = set.getRangeStart(i);
   1891         UChar32 end = set.getRangeEnd(i);
   1892         if (end > 0xFFFF) {
   1893             end = 0xFFFF;
   1894             i = set.getRangeCount(); // Should be unnecessary
   1895         }
   1896         pairs.append((UChar)start).append((UChar)end);
   1897     }
   1898     return pairs;
   1899 }
   1900 
   1901 /**
   1902  * Basic consistency check for a few items.
   1903  * That the iterator works, and that we can create a pattern and
   1904  * get the same thing back
   1905  */
   1906 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
   1907     {
   1908         UnicodeSet t(s);
   1909         checkEqual(s, t, "copy ct");
   1910     }
   1911 
   1912     {
   1913         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
   1914         t = s;
   1915         checkEqual(s, t, "operator=");
   1916     }
   1917 
   1918     {
   1919         UnicodeSet t;
   1920         copyWithIterator(t, s, FALSE);
   1921         checkEqual(s, t, "iterator roundtrip");
   1922     }
   1923 
   1924     {
   1925         UnicodeSet t;
   1926         copyWithIterator(t, s, TRUE); // try range
   1927         checkEqual(s, t, "iterator roundtrip");
   1928     }
   1929 
   1930     {
   1931         UnicodeSet t;
   1932         UnicodeString pat;
   1933         UErrorCode ec = U_ZERO_ERROR;
   1934         s.toPattern(pat, FALSE);
   1935         t.applyPattern(pat, ec);
   1936         if (U_FAILURE(ec)) {
   1937             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
   1938             return;
   1939         } else {
   1940             checkEqual(s, t, "toPattern(false)");
   1941         }
   1942     }
   1943 
   1944     {
   1945         UnicodeSet t;
   1946         UnicodeString pat;
   1947         UErrorCode ec = U_ZERO_ERROR;
   1948         s.toPattern(pat, TRUE);
   1949         t.applyPattern(pat, ec);
   1950         if (U_FAILURE(ec)) {
   1951             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
   1952             return;
   1953         } else {
   1954             checkEqual(s, t, "toPattern(true)");
   1955         }
   1956     }
   1957 }
   1958 
   1959 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
   1960   if(U_FAILURE(status)) return;
   1961   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
   1962   if(status == U_BUFFER_OVERFLOW_ERROR) {
   1963     status = U_ZERO_ERROR;
   1964     serializeBuffer.resize(len);
   1965     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
   1966     // let 2nd error stand
   1967   }
   1968   if(U_FAILURE(status)) {
   1969     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
   1970     return;
   1971   }
   1972   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
   1973   if(U_FAILURE(status)) {
   1974     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
   1975     return;
   1976   }
   1977 
   1978   checkEqual(t, deserialized, "Set was unequal when deserialized");
   1979 }
   1980 
   1981 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
   1982     t.clear();
   1983     UnicodeSetIterator it(s);
   1984     if (withRange) {
   1985         while (it.nextRange()) {
   1986             if (it.isString()) {
   1987                 t.add(it.getString());
   1988             } else {
   1989                 t.add(it.getCodepoint(), it.getCodepointEnd());
   1990             }
   1991         }
   1992     } else {
   1993         while (it.next()) {
   1994             if (it.isString()) {
   1995                 t.add(it.getString());
   1996             } else {
   1997                 t.add(it.getCodepoint());
   1998             }
   1999         }
   2000     }
   2001 }
   2002 
   2003 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
   2004   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
   2005   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
   2006     UnicodeString source; s.toPattern(source, TRUE);
   2007     UnicodeString result; t.toPattern(result, TRUE);
   2008     if (s != t) {
   2009         errln((UnicodeString)"FAIL: " + message
   2010               + "; source = " + source
   2011               + "; result = " + result
   2012               );
   2013         return FALSE;
   2014     } else {
   2015         logln((UnicodeString)"Ok: " + message
   2016               + "; source = " + source
   2017               + "; result = " + result
   2018               );
   2019     }
   2020     return TRUE;
   2021 }
   2022 
   2023 void
   2024 UnicodeSetTest::expectContainment(const UnicodeString& pat,
   2025                                   const UnicodeString& charsIn,
   2026                                   const UnicodeString& charsOut) {
   2027     UErrorCode ec = U_ZERO_ERROR;
   2028     UnicodeSet set(pat, ec);
   2029     if (U_FAILURE(ec)) {
   2030         dataerrln((UnicodeString)"FAIL: pattern \"" +
   2031               pat + "\" => " + u_errorName(ec));
   2032         return;
   2033     }
   2034     expectContainment(set, pat, charsIn, charsOut);
   2035 }
   2036 
   2037 void
   2038 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   2039                                   const UnicodeString& charsIn,
   2040                                   const UnicodeString& charsOut) {
   2041     UnicodeString pat;
   2042     set.toPattern(pat);
   2043     expectContainment(set, pat, charsIn, charsOut);
   2044 }
   2045 
   2046 void
   2047 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   2048                                   const UnicodeString& setName,
   2049                                   const UnicodeString& charsIn,
   2050                                   const UnicodeString& charsOut) {
   2051     UnicodeString bad;
   2052     UChar32 c;
   2053     int32_t i;
   2054 
   2055     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
   2056         c = charsIn.char32At(i);
   2057         if (!set.contains(c)) {
   2058             bad.append(c);
   2059         }
   2060     }
   2061     if (bad.length() > 0) {
   2062         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
   2063               ", expected containment of " + prettify(charsIn));
   2064     } else {
   2065         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
   2066     }
   2067 
   2068     bad.truncate(0);
   2069     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
   2070         c = charsOut.char32At(i);
   2071         if (set.contains(c)) {
   2072             bad.append(c);
   2073         }
   2074     }
   2075     if (bad.length() > 0) {
   2076         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
   2077               ", expected non-containment of " + prettify(charsOut));
   2078     } else {
   2079         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
   2080     }
   2081 }
   2082 
   2083 void
   2084 UnicodeSetTest::expectPattern(UnicodeSet& set,
   2085                               const UnicodeString& pattern,
   2086                               const UnicodeString& expectedPairs){
   2087     UErrorCode status = U_ZERO_ERROR;
   2088     set.applyPattern(pattern, status);
   2089     if (U_FAILURE(status)) {
   2090         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2091               "\") failed");
   2092         return;
   2093     } else {
   2094         if (getPairs(set) != expectedPairs ) {
   2095             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2096                   "\") => pairs \"" +
   2097                   escape(getPairs(set)) + "\", expected \"" +
   2098                   escape(expectedPairs) + "\"");
   2099         } else {
   2100             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
   2101                   "\") => pairs \"" +
   2102                   escape(getPairs(set)) + "\"");
   2103         }
   2104     }
   2105     // the result of calling set.toPattern(), which is the string representation of
   2106     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
   2107     // will produce another set that is equal to this one.
   2108     UnicodeString temppattern;
   2109     set.toPattern(temppattern);
   2110     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
   2111     if (U_FAILURE(status)) {
   2112         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
   2113         return;
   2114     }
   2115     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
   2116         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
   2117             escape(getPairs(set)) + "\""));
   2118     } else{
   2119         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
   2120     }
   2121 
   2122     delete tempset;
   2123 
   2124 }
   2125 
   2126 void
   2127 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
   2128     if (getPairs(set) != expectedPairs) {
   2129         errln(UnicodeString("FAIL: Expected pair list \"") +
   2130               escape(expectedPairs) + "\", got \"" +
   2131               escape(getPairs(set)) + "\"");
   2132     }
   2133 }
   2134 
   2135 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
   2136                                      const UnicodeString& expPat,
   2137                                      const char** expStrings) {
   2138     UnicodeString pat;
   2139     set.toPattern(pat, TRUE);
   2140     if (pat == expPat) {
   2141         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
   2142     } else {
   2143         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
   2144         return;
   2145     }
   2146     if (expStrings == NULL) {
   2147         return;
   2148     }
   2149     UBool in = TRUE;
   2150     for (int32_t i=0; expStrings[i] != NULL; ++i) {
   2151         if (expStrings[i] == NOT) { // sic; pointer comparison
   2152             in = FALSE;
   2153             continue;
   2154         }
   2155         UnicodeString s = CharsToUnicodeString(expStrings[i]);
   2156         UBool contained = set.contains(s);
   2157         if (contained == in) {
   2158             logln((UnicodeString)"Ok: " + expPat +
   2159                   (contained ? " contains {" : " does not contain {") +
   2160                   escape(expStrings[i]) + "}");
   2161         } else {
   2162             errln((UnicodeString)"FAIL: " + expPat +
   2163                   (contained ? " contains {" : " does not contain {") +
   2164                   escape(expStrings[i]) + "}");
   2165         }
   2166     }
   2167 }
   2168 
   2169 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
   2170 
   2171 void
   2172 UnicodeSetTest::doAssert(UBool condition, const char *message)
   2173 {
   2174     if (!condition) {
   2175         errln(UnicodeString("ERROR : ") + message);
   2176     }
   2177 }
   2178 
   2179 UnicodeString
   2180 UnicodeSetTest::escape(const UnicodeString& s) {
   2181     UnicodeString buf;
   2182     for (int32_t i=0; i<s.length(); )
   2183     {
   2184         UChar32 c = s.char32At(i);
   2185         if (0x0020 <= c && c <= 0x007F) {
   2186             buf += c;
   2187         } else {
   2188             if (c <= 0xFFFF) {
   2189                 buf += (UChar)0x5c; buf += (UChar)0x75;
   2190             } else {
   2191                 buf += (UChar)0x5c; buf += (UChar)0x55;
   2192                 buf += toHexString((c & 0xF0000000) >> 28);
   2193                 buf += toHexString((c & 0x0F000000) >> 24);
   2194                 buf += toHexString((c & 0x00F00000) >> 20);
   2195                 buf += toHexString((c & 0x000F0000) >> 16);
   2196             }
   2197             buf += toHexString((c & 0xF000) >> 12);
   2198             buf += toHexString((c & 0x0F00) >> 8);
   2199             buf += toHexString((c & 0x00F0) >> 4);
   2200             buf += toHexString(c & 0x000F);
   2201         }
   2202         i += U16_LENGTH(c);
   2203     }
   2204     return buf;
   2205 }
   2206 
   2207 void UnicodeSetTest::TestFreezable() {
   2208     UErrorCode errorCode=U_ZERO_ERROR;
   2209     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
   2210     UnicodeSet idSet(idPattern, errorCode);
   2211     if(U_FAILURE(errorCode)) {
   2212         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
   2213         return;
   2214     }
   2215 
   2216     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
   2217     UnicodeSet wsSet(wsPattern, errorCode);
   2218     if(U_FAILURE(errorCode)) {
   2219         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
   2220         return;
   2221     }
   2222 
   2223     idSet.add(idPattern);
   2224     UnicodeSet frozen(idSet);
   2225     frozen.freeze();
   2226 
   2227     if(idSet.isFrozen() || !frozen.isFrozen()) {
   2228         errln("FAIL: isFrozen() is wrong");
   2229     }
   2230     if(frozen!=idSet || !(frozen==idSet)) {
   2231         errln("FAIL: a copy-constructed frozen set differs from its original");
   2232     }
   2233 
   2234     frozen=wsSet;
   2235     if(frozen!=idSet || !(frozen==idSet)) {
   2236         errln("FAIL: a frozen set was modified by operator=");
   2237     }
   2238 
   2239     UnicodeSet frozen2(frozen);
   2240     if(frozen2!=frozen || frozen2!=idSet) {
   2241         errln("FAIL: a copied frozen set differs from its frozen original");
   2242     }
   2243     if(!frozen2.isFrozen()) {
   2244         errln("FAIL: copy-constructing a frozen set results in a thawed one");
   2245     }
   2246     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
   2247     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
   2248         errln("FAIL: UnicodeSet(5, 55) failed");
   2249     }
   2250     frozen3=frozen;
   2251     if(!frozen3.isFrozen()) {
   2252         errln("FAIL: copying a frozen set results in a thawed one");
   2253     }
   2254 
   2255     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
   2256     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
   2257         errln("FAIL: clone() failed");
   2258     }
   2259     cloned->add(0xd802, 0xd805);
   2260     if(cloned->containsSome(0xd802, 0xd805)) {
   2261         errln("FAIL: unable to modify clone");
   2262     }
   2263     delete cloned;
   2264 
   2265     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
   2266     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
   2267         errln("FAIL: cloneAsThawed() failed");
   2268     }
   2269     thawed->add(0xd802, 0xd805);
   2270     if(!thawed->contains(0xd802, 0xd805)) {
   2271         errln("FAIL: unable to modify thawed clone");
   2272     }
   2273     delete thawed;
   2274 
   2275     frozen.set(5, 55);
   2276     if(frozen!=idSet || !(frozen==idSet)) {
   2277         errln("FAIL: UnicodeSet::set() modified a frozen set");
   2278     }
   2279 
   2280     frozen.clear();
   2281     if(frozen!=idSet || !(frozen==idSet)) {
   2282         errln("FAIL: UnicodeSet::clear() modified a frozen set");
   2283     }
   2284 
   2285     frozen.closeOver(USET_CASE_INSENSITIVE);
   2286     if(frozen!=idSet || !(frozen==idSet)) {
   2287         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
   2288     }
   2289 
   2290     frozen.compact();
   2291     if(frozen!=idSet || !(frozen==idSet)) {
   2292         errln("FAIL: UnicodeSet::compact() modified a frozen set");
   2293     }
   2294 
   2295     ParsePosition pos;
   2296     frozen.
   2297         applyPattern(wsPattern, errorCode).
   2298         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
   2299         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
   2300         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
   2301         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
   2302     if(frozen!=idSet || !(frozen==idSet)) {
   2303         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
   2304     }
   2305 
   2306     frozen.
   2307         add(0xd800).
   2308         add(0xd802, 0xd805).
   2309         add(wsPattern).
   2310         addAll(idPattern).
   2311         addAll(wsSet);
   2312     if(frozen!=idSet || !(frozen==idSet)) {
   2313         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
   2314     }
   2315 
   2316     frozen.
   2317         retain(0x62).
   2318         retain(0x64, 0x69).
   2319         retainAll(wsPattern).
   2320         retainAll(wsSet);
   2321     if(frozen!=idSet || !(frozen==idSet)) {
   2322         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
   2323     }
   2324 
   2325     frozen.
   2326         remove(0x62).
   2327         remove(0x64, 0x69).
   2328         remove(idPattern).
   2329         removeAll(idPattern).
   2330         removeAll(idSet);
   2331     if(frozen!=idSet || !(frozen==idSet)) {
   2332         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
   2333     }
   2334 
   2335     frozen.
   2336         complement().
   2337         complement(0x62).
   2338         complement(0x64, 0x69).
   2339         complement(idPattern).
   2340         complementAll(idPattern).
   2341         complementAll(idSet);
   2342     if(frozen!=idSet || !(frozen==idSet)) {
   2343         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
   2344     }
   2345 }
   2346 
   2347 // Test span() etc. -------------------------------------------------------- ***
   2348 
   2349 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   2350 static int32_t
   2351 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
   2352     UErrorCode errorCode=U_ZERO_ERROR;
   2353     int32_t length8=0;
   2354     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
   2355     if(U_SUCCESS(errorCode)) {
   2356         return length8;
   2357     } else {
   2358         // The string contains an unpaired surrogate.
   2359         // Ignore this string.
   2360         return 0;
   2361     }
   2362 }
   2363 
   2364 class UnicodeSetWithStringsIterator;
   2365 
   2366 // Make the strings in a UnicodeSet easily accessible.
   2367 class UnicodeSetWithStrings {
   2368 public:
   2369     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
   2370             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
   2371         int32_t size=set.size();
   2372         if(size>0 && set.charAt(size-1)<0) {
   2373             // If a set's last element is not a code point, then it must contain strings.
   2374             // Iterate over the set, skip all code point ranges, and cache the strings.
   2375             // Convert them to UTF-8 for spanUTF8().
   2376             UnicodeSetIterator iter(set);
   2377             const UnicodeString *s;
   2378             char *s8=utf8;
   2379             int32_t length8, utf8Count=0;
   2380             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
   2381                 if(iter.isString()) {
   2382                     // Store the pointer to the set's string element
   2383                     // which we happen to know is a stable pointer.
   2384                     strings[stringsLength]=s=&iter.getString();
   2385                     utf8Count+=
   2386                         utf8Lengths[stringsLength]=length8=
   2387                         appendUTF8(s->getBuffer(), s->length(),
   2388                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
   2389                     if(length8==0) {
   2390                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
   2391                     }
   2392                     s8+=length8;
   2393                     ++stringsLength;
   2394                 }
   2395             }
   2396         }
   2397     }
   2398 
   2399     const UnicodeSet &getSet() const {
   2400         return set;
   2401     }
   2402 
   2403     UBool hasStrings() const {
   2404         return (UBool)(stringsLength>0);
   2405     }
   2406 
   2407     UBool hasStringsWithSurrogates() const {
   2408         return hasSurrogates;
   2409     }
   2410 
   2411 private:
   2412     friend class UnicodeSetWithStringsIterator;
   2413 
   2414     const UnicodeSet &set;
   2415 
   2416     const UnicodeString *strings[20];
   2417     int32_t stringsLength;
   2418     UBool hasSurrogates;
   2419 
   2420     char utf8[1024];
   2421     int32_t utf8Lengths[20];
   2422 };
   2423 
   2424 class UnicodeSetWithStringsIterator {
   2425 public:
   2426     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
   2427             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
   2428     }
   2429 
   2430     void reset() {
   2431         nextStringIndex=nextUTF8Start=0;
   2432     }
   2433 
   2434     const UnicodeString *nextString() {
   2435         if(nextStringIndex<fSet.stringsLength) {
   2436             return fSet.strings[nextStringIndex++];
   2437         } else {
   2438             return NULL;
   2439         }
   2440     }
   2441 
   2442     // Do not mix with calls to nextString().
   2443     const char *nextUTF8(int32_t &length) {
   2444         if(nextStringIndex<fSet.stringsLength) {
   2445             const char *s8=fSet.utf8+nextUTF8Start;
   2446             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
   2447             return s8;
   2448         } else {
   2449             length=0;
   2450             return NULL;
   2451         }
   2452     }
   2453 
   2454 private:
   2455     const UnicodeSetWithStrings &fSet;
   2456     int32_t nextStringIndex;
   2457     int32_t nextUTF8Start;
   2458 };
   2459 
   2460 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   2461 // at code point boundaries.
   2462 // That is, each edge of a match must not be in the middle of a surrogate pair.
   2463 static inline UBool
   2464 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
   2465     s+=start;
   2466     limit-=start;
   2467     int32_t length=t.length();
   2468     return 0==t.compare(s, length) &&
   2469            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
   2470            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   2471 }
   2472 
   2473 // Implement span() with contains() for comparison.
   2474 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2475                                  USetSpanCondition spanCondition) {
   2476     const UnicodeSet &realSet(set.getSet());
   2477     if(!set.hasStrings()) {
   2478         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2479             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2480         }
   2481 
   2482         UChar32 c;
   2483         int32_t start=0, prev;
   2484         while((prev=start)<length) {
   2485             U16_NEXT(s, start, length, c);
   2486             if(realSet.contains(c)!=spanCondition) {
   2487                 break;
   2488             }
   2489         }
   2490         return prev;
   2491     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2492         UnicodeSetWithStringsIterator iter(set);
   2493         UChar32 c;
   2494         int32_t start, next;
   2495         for(start=next=0; start<length;) {
   2496             U16_NEXT(s, next, length, c);
   2497             if(realSet.contains(c)) {
   2498                 break;
   2499             }
   2500             const UnicodeString *str;
   2501             iter.reset();
   2502             while((str=iter.nextString())!=NULL) {
   2503                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2504                     // spanNeedsStrings=TRUE;
   2505                     return start;
   2506                 }
   2507             }
   2508             start=next;
   2509         }
   2510         return start;
   2511     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2512         UnicodeSetWithStringsIterator iter(set);
   2513         UChar32 c;
   2514         int32_t start, next, maxSpanLimit=0;
   2515         for(start=next=0; start<length;) {
   2516             U16_NEXT(s, next, length, c);
   2517             if(!realSet.contains(c)) {
   2518                 next=start;  // Do not span this single, not-contained code point.
   2519             }
   2520             const UnicodeString *str;
   2521             iter.reset();
   2522             while((str=iter.nextString())!=NULL) {
   2523                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2524                     // spanNeedsStrings=TRUE;
   2525                     int32_t matchLimit=start+str->length();
   2526                     if(matchLimit==length) {
   2527                         return length;
   2528                     }
   2529                     if(spanCondition==USET_SPAN_CONTAINED) {
   2530                         // Iterate for the shortest match at each position.
   2531                         // Recurse for each but the shortest match.
   2532                         if(next==start) {
   2533                             next=matchLimit;  // First match from start.
   2534                         } else {
   2535                             if(matchLimit<next) {
   2536                                 // Remember shortest match from start for iteration.
   2537                                 int32_t temp=next;
   2538                                 next=matchLimit;
   2539                                 matchLimit=temp;
   2540                             }
   2541                             // Recurse for non-shortest match from start.
   2542                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
   2543                                                                  USET_SPAN_CONTAINED);
   2544                             if((matchLimit+spanLength)>maxSpanLimit) {
   2545                                 maxSpanLimit=matchLimit+spanLength;
   2546                                 if(maxSpanLimit==length) {
   2547                                     return length;
   2548                                 }
   2549                             }
   2550                         }
   2551                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2552                         if(matchLimit>next) {
   2553                             // Remember longest match from start.
   2554                             next=matchLimit;
   2555                         }
   2556                     }
   2557                 }
   2558             }
   2559             if(next==start) {
   2560                 break;  // No match from start.
   2561             }
   2562             start=next;
   2563         }
   2564         if(start>maxSpanLimit) {
   2565             return start;
   2566         } else {
   2567             return maxSpanLimit;
   2568         }
   2569     }
   2570 }
   2571 
   2572 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2573                                      USetSpanCondition spanCondition) {
   2574     if(length==0) {
   2575         return 0;
   2576     }
   2577     const UnicodeSet &realSet(set.getSet());
   2578     if(!set.hasStrings()) {
   2579         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2580             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2581         }
   2582 
   2583         UChar32 c;
   2584         int32_t prev=length;
   2585         do {
   2586             U16_PREV(s, 0, length, c);
   2587             if(realSet.contains(c)!=spanCondition) {
   2588                 break;
   2589             }
   2590         } while((prev=length)>0);
   2591         return prev;
   2592     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2593         UnicodeSetWithStringsIterator iter(set);
   2594         UChar32 c;
   2595         int32_t prev=length, length0=length;
   2596         do {
   2597             U16_PREV(s, 0, length, c);
   2598             if(realSet.contains(c)) {
   2599                 break;
   2600             }
   2601             const UnicodeString *str;
   2602             iter.reset();
   2603             while((str=iter.nextString())!=NULL) {
   2604                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2605                     // spanNeedsStrings=TRUE;
   2606                     return prev;
   2607                 }
   2608             }
   2609         } while((prev=length)>0);
   2610         return prev;
   2611     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2612         UnicodeSetWithStringsIterator iter(set);
   2613         UChar32 c;
   2614         int32_t prev=length, minSpanStart=length, length0=length;
   2615         do {
   2616             U16_PREV(s, 0, length, c);
   2617             if(!realSet.contains(c)) {
   2618                 length=prev;  // Do not span this single, not-contained code point.
   2619             }
   2620             const UnicodeString *str;
   2621             iter.reset();
   2622             while((str=iter.nextString())!=NULL) {
   2623                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2624                     // spanNeedsStrings=TRUE;
   2625                     int32_t matchStart=prev-str->length();
   2626                     if(matchStart==0) {
   2627                         return 0;
   2628                     }
   2629                     if(spanCondition==USET_SPAN_CONTAINED) {
   2630                         // Iterate for the shortest match at each position.
   2631                         // Recurse for each but the shortest match.
   2632                         if(length==prev) {
   2633                             length=matchStart;  // First match from prev.
   2634                         } else {
   2635                             if(matchStart>length) {
   2636                                 // Remember shortest match from prev for iteration.
   2637                                 int32_t temp=length;
   2638                                 length=matchStart;
   2639                                 matchStart=temp;
   2640                             }
   2641                             // Recurse for non-shortest match from prev.
   2642                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
   2643                                                                     USET_SPAN_CONTAINED);
   2644                             if(spanStart<minSpanStart) {
   2645                                 minSpanStart=spanStart;
   2646                                 if(minSpanStart==0) {
   2647                                     return 0;
   2648                                 }
   2649                             }
   2650                         }
   2651                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2652                         if(matchStart<length) {
   2653                             // Remember longest match from prev.
   2654                             length=matchStart;
   2655                         }
   2656                     }
   2657                 }
   2658             }
   2659             if(length==prev) {
   2660                 break;  // No match from prev.
   2661             }
   2662         } while((prev=length)>0);
   2663         if(prev<minSpanStart) {
   2664             return prev;
   2665         } else {
   2666             return minSpanStart;
   2667         }
   2668     }
   2669 }
   2670 
   2671 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2672                                 USetSpanCondition spanCondition) {
   2673     const UnicodeSet &realSet(set.getSet());
   2674     if(!set.hasStrings()) {
   2675         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2676             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2677         }
   2678 
   2679         UChar32 c;
   2680         int32_t start=0, prev;
   2681         while((prev=start)<length) {
   2682             U8_NEXT_OR_FFFD(s, start, length, c);
   2683             if(realSet.contains(c)!=spanCondition) {
   2684                 break;
   2685             }
   2686         }
   2687         return prev;
   2688     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2689         UnicodeSetWithStringsIterator iter(set);
   2690         UChar32 c;
   2691         int32_t start, next;
   2692         for(start=next=0; start<length;) {
   2693             U8_NEXT_OR_FFFD(s, next, length, c);
   2694             if(realSet.contains(c)) {
   2695                 break;
   2696             }
   2697             const char *s8;
   2698             int32_t length8;
   2699             iter.reset();
   2700             while((s8=iter.nextUTF8(length8))!=NULL) {
   2701                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2702                     // spanNeedsStrings=TRUE;
   2703                     return start;
   2704                 }
   2705             }
   2706             start=next;
   2707         }
   2708         return start;
   2709     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2710         UnicodeSetWithStringsIterator iter(set);
   2711         UChar32 c;
   2712         int32_t start, next, maxSpanLimit=0;
   2713         for(start=next=0; start<length;) {
   2714             U8_NEXT_OR_FFFD(s, next, length, c);
   2715             if(!realSet.contains(c)) {
   2716                 next=start;  // Do not span this single, not-contained code point.
   2717             }
   2718             const char *s8;
   2719             int32_t length8;
   2720             iter.reset();
   2721             while((s8=iter.nextUTF8(length8))!=NULL) {
   2722                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2723                     // spanNeedsStrings=TRUE;
   2724                     int32_t matchLimit=start+length8;
   2725                     if(matchLimit==length) {
   2726                         return length;
   2727                     }
   2728                     if(spanCondition==USET_SPAN_CONTAINED) {
   2729                         // Iterate for the shortest match at each position.
   2730                         // Recurse for each but the shortest match.
   2731                         if(next==start) {
   2732                             next=matchLimit;  // First match from start.
   2733                         } else {
   2734                             if(matchLimit<next) {
   2735                                 // Remember shortest match from start for iteration.
   2736                                 int32_t temp=next;
   2737                                 next=matchLimit;
   2738                                 matchLimit=temp;
   2739                             }
   2740                             // Recurse for non-shortest match from start.
   2741                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
   2742                                                                 USET_SPAN_CONTAINED);
   2743                             if((matchLimit+spanLength)>maxSpanLimit) {
   2744                                 maxSpanLimit=matchLimit+spanLength;
   2745                                 if(maxSpanLimit==length) {
   2746                                     return length;
   2747                                 }
   2748                             }
   2749                         }
   2750                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2751                         if(matchLimit>next) {
   2752                             // Remember longest match from start.
   2753                             next=matchLimit;
   2754                         }
   2755                     }
   2756                 }
   2757             }
   2758             if(next==start) {
   2759                 break;  // No match from start.
   2760             }
   2761             start=next;
   2762         }
   2763         if(start>maxSpanLimit) {
   2764             return start;
   2765         } else {
   2766             return maxSpanLimit;
   2767         }
   2768     }
   2769 }
   2770 
   2771 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2772                                     USetSpanCondition spanCondition) {
   2773     if(length==0) {
   2774         return 0;
   2775     }
   2776     const UnicodeSet &realSet(set.getSet());
   2777     if(!set.hasStrings()) {
   2778         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2779             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2780         }
   2781 
   2782         UChar32 c;
   2783         int32_t prev=length;
   2784         do {
   2785             U8_PREV_OR_FFFD(s, 0, length, c);
   2786             if(realSet.contains(c)!=spanCondition) {
   2787                 break;
   2788             }
   2789         } while((prev=length)>0);
   2790         return prev;
   2791     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2792         UnicodeSetWithStringsIterator iter(set);
   2793         UChar32 c;
   2794         int32_t prev=length;
   2795         do {
   2796             U8_PREV_OR_FFFD(s, 0, length, c);
   2797             if(realSet.contains(c)) {
   2798                 break;
   2799             }
   2800             const char *s8;
   2801             int32_t length8;
   2802             iter.reset();
   2803             while((s8=iter.nextUTF8(length8))!=NULL) {
   2804                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2805                     // spanNeedsStrings=TRUE;
   2806                     return prev;
   2807                 }
   2808             }
   2809         } while((prev=length)>0);
   2810         return prev;
   2811     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2812         UnicodeSetWithStringsIterator iter(set);
   2813         UChar32 c;
   2814         int32_t prev=length, minSpanStart=length;
   2815         do {
   2816             U8_PREV_OR_FFFD(s, 0, length, c);
   2817             if(!realSet.contains(c)) {
   2818                 length=prev;  // Do not span this single, not-contained code point.
   2819             }
   2820             const char *s8;
   2821             int32_t length8;
   2822             iter.reset();
   2823             while((s8=iter.nextUTF8(length8))!=NULL) {
   2824                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2825                     // spanNeedsStrings=TRUE;
   2826                     int32_t matchStart=prev-length8;
   2827                     if(matchStart==0) {
   2828                         return 0;
   2829                     }
   2830                     if(spanCondition==USET_SPAN_CONTAINED) {
   2831                         // Iterate for the shortest match at each position.
   2832                         // Recurse for each but the shortest match.
   2833                         if(length==prev) {
   2834                             length=matchStart;  // First match from prev.
   2835                         } else {
   2836                             if(matchStart>length) {
   2837                                 // Remember shortest match from prev for iteration.
   2838                                 int32_t temp=length;
   2839                                 length=matchStart;
   2840                                 matchStart=temp;
   2841                             }
   2842                             // Recurse for non-shortest match from prev.
   2843                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
   2844                                                                    USET_SPAN_CONTAINED);
   2845                             if(spanStart<minSpanStart) {
   2846                                 minSpanStart=spanStart;
   2847                                 if(minSpanStart==0) {
   2848                                     return 0;
   2849                                 }
   2850                             }
   2851                         }
   2852                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2853                         if(matchStart<length) {
   2854                             // Remember longest match from prev.
   2855                             length=matchStart;
   2856                         }
   2857                     }
   2858                 }
   2859             }
   2860             if(length==prev) {
   2861                 break;  // No match from prev.
   2862             }
   2863         } while((prev=length)>0);
   2864         if(prev<minSpanStart) {
   2865             return prev;
   2866         } else {
   2867             return minSpanStart;
   2868         }
   2869     }
   2870 }
   2871 
   // Bit flags that select which spans are performed and compared.
   // The flags come in pairs; each pair is followed by a mask that combines both.
   enum {
       SPAN_UTF16          =0x01,
       SPAN_UTF8           =0x02,
       SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,          // 3

       SPAN_SET            =0x04,
       SPAN_COMPLEMENT     =0x08,
       SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,      // 0xc

       SPAN_FWD            =0x10,
       SPAN_BACK           =0x20,
       SPAN_DIRS           =SPAN_FWD|SPAN_BACK,            // 0x30

       SPAN_CONTAINED      =0x100,
       SPAN_SIMPLE         =0x200,
       SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,    // 0x300

       // All of the flags above.
       SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION  // 0x33f
   };
   2892 
   2893 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
   2894     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
   2895 }
   2896 
   2897 static inline int32_t slen(const void *s, UBool isUTF16) {
   2898     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
   2899 }
   2900 
   2901 /*
   2902  * Count spans on a string with the method according to type and set the span limits.
   2903  * The set may be the complement of the original.
   2904  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
   2905  * according to the expected number of spans.
   2906  * Sets typeName to an empty string if there is no such type.
   2907  * Returns -1 if the span option is filtered out.
   2908  */
   2909 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
   2910                         const void *s, int32_t length, UBool isUTF16,
   2911                         uint32_t whichSpans,
   2912                         int type, const char *&typeName,
   2913                         int32_t limits[], int32_t limitsCapacity,
   2914                         int32_t expectCount) {
   2915     const UnicodeSet &realSet(set.getSet());
   2916     int32_t start, count;
   2917     USetSpanCondition spanCondition, firstSpanCondition, contained;
   2918     UBool isForward;
   2919 
   2920     if(type<0 || 7<type) {
   2921         typeName="";
   2922         return 0;
   2923     }
   2924 
   2925     static const char *const typeNames16[]={
   2926         "contains", "contains(LM)",
   2927         "span", "span(LM)",
   2928         "containsBack", "containsBack(LM)",
   2929         "spanBack", "spanBack(LM)"
   2930     };
   2931 
   2932     static const char *const typeNames8[]={
   2933         "containsUTF8", "containsUTF8(LM)",
   2934         "spanUTF8", "spanUTF8(LM)",
   2935         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
   2936         "spanBackUTF8", "spanBackUTF8(LM)"
   2937     };
   2938 
   2939     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
   2940 
   2941     // filter span options
   2942     if(type<=3) {
   2943         // span forward
   2944         if((whichSpans&SPAN_FWD)==0) {
   2945             return -1;
   2946         }
   2947         isForward=TRUE;
   2948     } else {
   2949         // span backward
   2950         if((whichSpans&SPAN_BACK)==0) {
   2951             return -1;
   2952         }
   2953         isForward=FALSE;
   2954     }
   2955     if((type&1)==0) {
   2956         // use USET_SPAN_CONTAINED
   2957         if((whichSpans&SPAN_CONTAINED)==0) {
   2958             return -1;
   2959         }
   2960         contained=USET_SPAN_CONTAINED;
   2961     } else {
   2962         // use USET_SPAN_SIMPLE
   2963         if((whichSpans&SPAN_SIMPLE)==0) {
   2964             return -1;
   2965         }
   2966         contained=USET_SPAN_SIMPLE;
   2967     }
   2968 
   2969     // Default first span condition for going forward with an uncomplemented set.
   2970     spanCondition=USET_SPAN_NOT_CONTAINED;
   2971     if(isComplement) {
   2972         spanCondition=invertSpanCondition(spanCondition, contained);
   2973     }
   2974 
   2975     // First span condition for span(), used to terminate the spanBack() iteration.
   2976     firstSpanCondition=spanCondition;
   2977 
   2978     // spanBack(): Its initial span condition is span()'s last span condition,
   2979     // which is the opposite of span()'s first span condition
   2980     // if we expect an even number of spans.
   2981     // (The loop inverts spanCondition (expectCount-1) times
   2982     // before the expectCount'th span() call.)
   2983     // If we do not compare forward and backward directions, then we do not have an
   2984     // expectCount and just start with firstSpanCondition.
   2985     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
   2986         spanCondition=invertSpanCondition(spanCondition, contained);
   2987     }
   2988 
   2989     count=0;
   2990     switch(type) {
   2991     case 0:
   2992     case 1:
   2993         start=0;
   2994         if(length<0) {
   2995             length=slen(s, isUTF16);
   2996         }
   2997         for(;;) {
   2998             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
   2999                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
   3000             if(count<limitsCapacity) {
   3001                 limits[count]=start;
   3002             }
   3003             ++count;
   3004             if(start>=length) {
   3005                 break;
   3006             }
   3007             spanCondition=invertSpanCondition(spanCondition, contained);
   3008         }
   3009         break;
   3010     case 2:
   3011     case 3:
   3012         start=0;
   3013         for(;;) {
   3014             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
   3015                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
   3016             if(count<limitsCapacity) {
   3017                 limits[count]=start;
   3018             }
   3019             ++count;
   3020             if(length>=0 ? start>=length :
   3021                            isUTF16 ? ((const UChar *)s)[start]==0 :
   3022                                      ((const char *)s)[start]==0
   3023             ) {
   3024                 break;
   3025             }
   3026             spanCondition=invertSpanCondition(spanCondition, contained);
   3027         }
   3028         break;
   3029     case 4:
   3030     case 5:
   3031         if(length<0) {
   3032             length=slen(s, isUTF16);
   3033         }
   3034         for(;;) {
   3035             ++count;
   3036             if(count<=limitsCapacity) {
   3037                 limits[limitsCapacity-count]=length;
   3038             }
   3039             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
   3040                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
   3041             if(length==0 && spanCondition==firstSpanCondition) {
   3042                 break;
   3043             }
   3044             spanCondition=invertSpanCondition(spanCondition, contained);
   3045         }
   3046         if(count<limitsCapacity) {
   3047             memmove(limits, limits+(limitsCapacity-count), count*4);
   3048         }
   3049         break;
   3050     case 6:
   3051     case 7:
   3052         for(;;) {
   3053             ++count;
   3054             if(count<=limitsCapacity) {
   3055                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
   3056             }
   3057             // Note: Length<0 is tested only for the first spanBack().
   3058             // If we wanted to keep length<0 for all spanBack()s, we would have to
   3059             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
   3060             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
   3061                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
   3062             if(length==0 && spanCondition==firstSpanCondition) {
   3063                 break;
   3064             }
   3065             spanCondition=invertSpanCondition(spanCondition, contained);
   3066         }
   3067         if(count<limitsCapacity) {
   3068             memmove(limits, limits+(limitsCapacity-count), count*4);
   3069         }
   3070         break;
   3071     default:
   3072         typeName="";
   3073         return -1;
   3074     }
   3075 
   3076     return count;
   3077 }
   3078 
   3079 // sets to be tested; odd index=isComplement
   3080 enum {
   3081     SLOW,
   3082     SLOW_NOT,
   3083     FAST,
   3084     FAST_NOT,
   3085     SET_COUNT
   3086 };
   3087 
// Names of the set variants for use in failure messages,
// indexed by the SLOW..FAST_NOT enum constants.
static const char *const setNames[SET_COUNT]={
    "slow",      // SLOW
    "slow.not",  // SLOW_NOT
    "fast",      // FAST
    "fast.not"   // FAST_NOT
};
   3094 
   3095 /*
   3096  * Verify that we get the same results whether we look at text with contains(),
   3097  * span() or spanBack(), using unfrozen or frozen versions of the set,
   3098  * and using the set or its complement (switching the spanConditions accordingly).
   3099  * The latter verifies that
   3100  *   set.span(spanCondition) == set.complement().span(!spanCondition).
   3101  *
   3102  * The expectLimits[] are either provided by the caller (with expectCount>=0)
   3103  * or returned to the caller (with an input expectCount<0).
   3104  */
   3105 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3106                               const void *s, int32_t length, UBool isUTF16,
   3107                               uint32_t whichSpans,
   3108                               int32_t expectLimits[], int32_t &expectCount,
   3109                               const char *testName, int32_t index) {
   3110     int32_t limits[500];
   3111     int32_t limitsCount;
   3112     int i, j;
   3113 
   3114     const char *typeName;
   3115     int type;
   3116 
   3117     for(i=0; i<SET_COUNT; ++i) {
   3118         if((i&1)==0) {
   3119             // Even-numbered sets are original, uncomplemented sets.
   3120             if((whichSpans&SPAN_SET)==0) {
   3121                 continue;
   3122             }
   3123         } else {
   3124             // Odd-numbered sets are complemented.
   3125             if((whichSpans&SPAN_COMPLEMENT)==0) {
   3126                 continue;
   3127             }
   3128         }
   3129         for(type=0;; ++type) {
   3130             limitsCount=getSpans(*sets[i], (UBool)(i&1),
   3131                                  s, length, isUTF16,
   3132                                  whichSpans,
   3133                                  type, typeName,
   3134                                  limits, UPRV_LENGTHOF(limits), expectCount);
   3135             if(typeName[0]==0) {
   3136                 break; // All types tried.
   3137             }
   3138             if(limitsCount<0) {
   3139                 continue; // Span option filtered out.
   3140             }
   3141             if(expectCount<0) {
   3142                 expectCount=limitsCount;
   3143                 if(limitsCount>UPRV_LENGTHOF(limits)) {
   3144                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
   3145                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
   3146                     return;
   3147                 }
   3148                 memcpy(expectLimits, limits, limitsCount*4);
   3149             } else if(limitsCount!=expectCount) {
   3150                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
   3151                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
   3152             } else {
   3153                 for(j=0; j<limitsCount; ++j) {
   3154                     if(limits[j]!=expectLimits[j]) {
   3155                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
   3156                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
   3157                               j, (long)limits[j], (long)expectLimits[j]);
   3158                         break;
   3159                     }
   3160                 }
   3161             }
   3162         }
   3163     }
   3164 
   3165     // Compare span() with containsAll()/containsNone(),
   3166     // but only if we have expectLimits[] from the uncomplemented set.
   3167     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
   3168         const UChar *s16=(const UChar *)s;
   3169         UnicodeString string;
   3170         int32_t prev=0, limit, length;
   3171         for(i=0; i<expectCount; ++i) {
   3172             limit=expectLimits[i];
   3173             length=limit-prev;
   3174             if(length>0) {
   3175                 string.setTo(FALSE, s16+prev, length);  // read-only alias
   3176                 if(i&1) {
   3177                     if(!sets[SLOW]->getSet().containsAll(string)) {
   3178                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3179                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3180                         return;
   3181                     }
   3182                     if(!sets[FAST]->getSet().containsAll(string)) {
   3183                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3184                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3185                         return;
   3186                     }
   3187                 } else {
   3188                     if(!sets[SLOW]->getSet().containsNone(string)) {
   3189                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3190                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3191                         return;
   3192                     }
   3193                     if(!sets[FAST]->getSet().containsNone(string)) {
   3194                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3195                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3196                         return;
   3197                     }
   3198                 }
   3199             }
   3200             prev=limit;
   3201         }
   3202     }
   3203 }
   3204 
   3205 // Specifically test either UTF-16 or UTF-8.
   3206 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3207                               const void *s, int32_t length, UBool isUTF16,
   3208                               uint32_t whichSpans,
   3209                               const char *testName, int32_t index) {
   3210     int32_t expectLimits[500];
   3211     int32_t expectCount=-1;
   3212     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
   3213 }
   3214 
   3215 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
   3216     UChar c, c2;
   3217 
   3218     if(length>=0) {
   3219         while(length>0) {
   3220             c=*s++;
   3221             --length;
   3222             if(0xd800<=c && c<0xe000) {
   3223                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
   3224                     return TRUE;
   3225                 }
   3226                 --length;
   3227             }
   3228         }
   3229     } else {
   3230         while((c=*s++)!=0) {
   3231             if(0xd800<=c && c<0xe000) {
   3232                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
   3233                     return TRUE;
   3234                 }
   3235             }
   3236         }
   3237     }
   3238     return FALSE;
   3239 }
   3240 
   3241 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
   3242 // unless either UTF is turned off in whichSpans.
   3243 // Testing UTF-16 and UTF-8 together requires that surrogate code points
   3244 // have the same contains(c) value as U+FFFD.
   3245 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
   3246                                       const UChar *s16, int32_t length16,
   3247                                       uint32_t whichSpans,
   3248                                       const char *testName, int32_t index) {
   3249     int32_t expectLimits[500];
   3250     int32_t expectCount;
   3251 
   3252     expectCount=-1;  // Get expectLimits[] from testSpan().
   3253 
   3254     if((whichSpans&SPAN_UTF16)!=0) {
   3255         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
   3256     }
   3257     if((whichSpans&SPAN_UTF8)==0) {
   3258         return;
   3259     }
   3260 
   3261     // Convert s16[] and expectLimits[] to UTF-8.
   3262     uint8_t s8[3000];
   3263     int32_t offsets[3000];
   3264 
   3265     const UChar *s16Limit=s16+length16;
   3266     char *t=(char *)s8;
   3267     char *tLimit=t+sizeof(s8);
   3268     int32_t *o=offsets;
   3269     UErrorCode errorCode=U_ZERO_ERROR;
   3270 
   3271     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
   3272     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
   3273     if(U_FAILURE(errorCode)) {
   3274         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
   3275               testName, (long)index, u_errorName(errorCode));
   3276         ucnv_resetFromUnicode(utf8Cnv);
   3277         return;
   3278     }
   3279     int32_t length8=(int32_t)(t-(char *)s8);
   3280 
   3281     // Convert expectLimits[].
   3282     int32_t i, j, expect;
   3283     for(i=j=0; i<expectCount; ++i) {
   3284         expect=expectLimits[i];
   3285         if(expect==length16) {
   3286             expectLimits[i]=length8;
   3287         } else {
   3288             while(offsets[j]<expect) {
   3289                 ++j;
   3290             }
   3291             expectLimits[i]=j;
   3292         }
   3293     }
   3294 
   3295     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
   3296 }
   3297 
   3298 static UChar32 nextCodePoint(UChar32 c) {
   3299     // Skip some large and boring ranges.
   3300     switch(c) {
   3301     case 0x3441:
   3302         return 0x4d7f;
   3303     case 0x5100:
   3304         return 0x9f00;
   3305     case 0xb040:
   3306         return 0xd780;
   3307     case 0xe041:
   3308         return 0xf8fe;
   3309     case 0x10100:
   3310         return 0x20000;
   3311     case 0x20041:
   3312         return 0xe0000;
   3313     case 0xe0101:
   3314         return 0x10fffd;
   3315     default:
   3316         return c+1;
   3317     }
   3318 }
   3319 
   3320 // Verify that all implementations represent the same set.
   3321 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3322     // contains(U+FFFD) is inconsistent with contains(some surrogates),
   3323     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
   3324     // Skip the UTF-8 part of the test - if the string contains surrogates -
   3325     // because it is likely to produce a different result.
   3326     UBool inconsistentSurrogates=
   3327             (!(sets[0]->getSet().contains(0xfffd) ?
   3328                sets[0]->getSet().contains(0xd800, 0xdfff) :
   3329                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
   3330              sets[0]->hasStringsWithSurrogates());
   3331 
   3332     UChar s[1000];
   3333     int32_t length=0;
   3334     uint32_t localWhichSpans;
   3335 
   3336     UChar32 c, first;
   3337     for(first=c=0;; c=nextCodePoint(c)) {
   3338         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
   3339             localWhichSpans=whichSpans;
   3340             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
   3341                 localWhichSpans&=~SPAN_UTF8;
   3342             }
   3343             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
   3344             if(c>0x10ffff) {
   3345                 break;
   3346             }
   3347             length=0;
   3348             first=c;
   3349         }
   3350         U16_APPEND_UNSAFE(s, length, c);
   3351     }
   3352 }
   3353 
   3354 // Test with a particular, interesting string.
   3355 // Specify length and try NUL-termination.
   3356 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3357     static const UChar s[]={
   3358         0x61, 0x62, 0x20,                       // Latin, space
   3359         0x3b1, 0x3b2, 0x3b3,                    // Greek
   3360         0xd900,                                 // lead surrogate
   3361         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
   3362         0xdc05,                                 // trail surrogate
   3363         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
   3364         0xd900, 0xdc05,                         // unassigned supplementary
   3365         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
   3366         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
   3367         0                                       // NUL
   3368     };
   3369 
   3370     if((whichSpans&SPAN_UTF16)==0) {
   3371         return;
   3372     }
   3373     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
   3374     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
   3375 }
   3376 
   3377 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3378     static const char s[]={
   3379         "abc"                                   // Latin
   3380 
   3381         /* trail byte in lead position */
   3382         "\x80"
   3383 
   3384         " "                                     // space
   3385 
   3386         /* truncated multi-byte sequences */
   3387         "\xd0"
   3388         "\xe0"
   3389         "\xe1"
   3390         "\xed"
   3391         "\xee"
   3392         "\xf0"
   3393         "\xf1"
   3394         "\xf4"
   3395         "\xf8"
   3396         "\xfc"
   3397 
   3398         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
   3399 
   3400         /* trail byte in lead position */
   3401         "\x80"
   3402 
   3403         "\xe0\x80"
   3404         "\xe0\xa0"
   3405         "\xe1\x80"
   3406         "\xed\x80"
   3407         "\xed\xa0"
   3408         "\xee\x80"
   3409         "\xf0\x80"
   3410         "\xf0\x90"
   3411         "\xf1\x80"
   3412         "\xf4\x80"
   3413         "\xf4\x90"
   3414         "\xf8\x80"
   3415         "\xfc\x80"
   3416 
   3417         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
   3418 
   3419         /* trail byte in lead position */
   3420         "\x80"
   3421 
   3422         "\xf0\x80\x80"
   3423         "\xf0\x90\x80"
   3424         "\xf1\x80\x80"
   3425         "\xf4\x80\x80"
   3426         "\xf4\x90\x80"
   3427         "\xf8\x80\x80"
   3428         "\xfc\x80\x80"
   3429 
   3430         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
   3431 
   3432         /* trail byte in lead position */
   3433         "\x80"
   3434 
   3435         "\xf8\x80\x80\x80"
   3436         "\xfc\x80\x80\x80"
   3437 
   3438         "\xF1\x90\x80\x85"                      // unassigned supplementary
   3439 
   3440         /* trail byte in lead position */
   3441         "\x80"
   3442 
   3443         "\xfc\x80\x80\x80\x80"
   3444 
   3445         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
   3446 
   3447         /* trail byte in lead position */
   3448         "\x80"
   3449 
   3450         /* complete sequences but non-shortest forms or out of range etc. */
   3451         "\xc0\x80"
   3452         "\xe0\x80\x80"
   3453         "\xed\xa0\x80"
   3454         "\xf0\x80\x80\x80"
   3455         "\xf4\x90\x80\x80"
   3456         "\xf8\x80\x80\x80\x80"
   3457         "\xfc\x80\x80\x80\x80\x80"
   3458         "\xfe"
   3459         "\xff"
   3460 
   3461         /* trail byte in lead position */
   3462         "\x80"
   3463 
   3464         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
   3465     };
   3466 
   3467     if((whichSpans&SPAN_UTF8)==0) {
   3468         return;
   3469     }
   3470     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
   3471     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
   3472 }
   3473 
   3474 // Take a set of span options and multiply them so that
   3475 // each portion only has one of the options a, b and c.
   3476 // If b==0, then the set of options is just modified with mask and a.
   3477 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
   3478 static int32_t
   3479 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
   3480                uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
   3481     uint32_t s;
   3482     int32_t i;
   3483 
   3484     for(i=0; i<whichSpansCount; ++i) {
   3485         s=whichSpans[i]&mask;
   3486         whichSpans[i]=s|a;
   3487         if(b!=0) {
   3488             whichSpans[whichSpansCount+i]=s|b;
   3489             if(c!=0) {
   3490                 whichSpans[2*whichSpansCount+i]=s|c;
   3491             }
   3492         }
   3493     }
   3494     return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
   3495 }
   3496 
// Runs of identical letters - 63 or 64 of them, going by the macro names -
// that TestSpan() concatenates into set strings and test strings
// with a code point span longer than 254.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
   3501 
   3502 void UnicodeSetTest::TestSpan() {
   3503     // "[...]" is a UnicodeSet pattern.
   3504     // "*" performs tests on all Unicode code points and on a selection of
   3505     //   malformed UTF-8/16 strings.
   3506     // "-options" limits the scope of testing for the current set.
   3507     //   By default, the test verifies that equivalent boundaries are found
   3508     //   for UTF-16 and UTF-8, going forward and backward,
   3509     //   alternating USET_SPAN_NOT_CONTAINED with
   3510     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
   3511     //   Single-character options:
   3512     //     8 -- UTF-16 and UTF-8 boundaries may differ.
   3513     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
   3514     //          or the set contains strings with unpaired surrogates
   3515     //          which do not translate to valid UTF-8.
   3516     //     c -- set.span() and set.complement().span() boundaries may differ.
   3517     //          Cause: Set strings are not complemented.
   3518     //     b -- span() and spanBack() boundaries may differ.
   3519     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
   3520     //          and spanBack(USET_SPAN_SIMPLE) are defined to
   3521     //          match with non-overlapping substrings.
   3522     //          For example, with a set containing "ab" and "ba",
   3523     //          span() of "aba" yields boundaries { 0, 2, 3 }
   3524     //          because the initial "ab" matches from 0 to 2,
   3525     //          while spanBack() yields boundaries { 0, 1, 3 }
   3526     //          because the final "ba" matches from 1 to 3.
   3527     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
   3528     //          Cause: Strings in the set overlap, and a longer match may
   3529     //          require a sequence including non-longest substrings.
   3530     //          For example, with a set containing "ab", "abc" and "cd",
   3531     //          span(contained) of "abcd" spans the entire string
   3532     //          but span(longest match) only spans the first 3 characters.
   3533     //   Each "-options" first resets all options and then applies the specified options.
   3534     //   A "-" without options resets the options.
   3535     //   The options are also reset for each new set.
   3536     // Other strings will be spanned.
   3537     static const char *const testdata[]={
   3538         "[:ID_Continue:]",
   3539         "*",
   3540         "[:White_Space:]",
   3541         "*",
   3542         "[]",
   3543         "*",
   3544         "[\\u0000-\\U0010FFFF]",
   3545         "*",
   3546         "[\\u0000\\u0080\\u0800\\U00010000]",
   3547         "*",
   3548         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
   3549         "*",
   3550         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
   3551         "-c",
   3552         "*",
   3553         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
   3554         "-c",
   3555         "*",
   3556 
   3557         // Overlapping strings cause overlapping attempts to match.
   3558         "[x{xy}{xya}{axy}{ax}]",
   3559         "-cl",
   3560 
   3561         // More repetitions of "xya" would take too long with the recursive
   3562         // reference implementation.
   3563         // containsAll()=FALSE
   3564         // test_string 0x14
   3565         "xx"
   3566         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
   3567         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
   3568         "xyaxyaxyaxya"
   3569         "xx"
   3570         "xyaxyaxyaxya"  // span() ends here.
   3571         "aaa",
   3572 
   3573         // containsAll()=TRUE
   3574         // test_string 0x15
   3575         "xx"
   3576         "xyaxyaxyaxya"
   3577         "xx"
   3578         "xyaxyaxyaxya"
   3579         "xx"
   3580         "xyaxyaxyaxy",
   3581 
   3582         "-bc",
   3583         // test_string 0x17
   3584         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
   3585         "-c",
   3586         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
   3587         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
   3588         "-",
   3589         "byaya",     // span() -> { 5 }
   3590         "byay",      // span() -> { 4 }
   3591         "bya",       // span() -> { 3 }
   3592 
   3593         // span(longest match) will not span the whole string.
   3594         "[a{ab}{bc}]",
   3595         "-cl",
   3596         // test_string 0x21
   3597         "abc",
   3598 
   3599         "[a{ab}{abc}{cd}]",
   3600         "-cl",
   3601         "acdabcdabccd",
   3602 
   3603         // spanBack(longest match) will not span the whole string.
   3604         "[c{ab}{bc}]",
   3605         "-cl",
   3606         "abc",
   3607 
   3608         "[d{cd}{bcd}{ab}]",
   3609         "-cl",
   3610         "abbcdabcdabd",
   3611 
   3612         // Test with non-ASCII set strings - test proper handling of surrogate pairs
   3613         // and UTF-8 trail bytes.
   3614         // Copies of above test sets and strings, but transliterated to have
   3615         // different code points with similar trail units.
   3616         // Previous: a      b         c            d
   3617         // Unicode:  042B   30AB      200AB        204AB
   3618         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
   3619         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
   3620         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
   3621         "-cl",
   3622         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
   3623 
   3624         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
   3625         "-cl",
   3626         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
   3627 
   3628         // Stress bookkeeping and recursion.
   3629         // The following strings are barely doable with the recursive
   3630         // reference implementation.
   3631         // The not-contained character at the end prevents an early exit from the span().
   3632         "[b{bb}]",
   3633         "-c",
   3634         // test_string 0x33
   3635         "bbbbbbbbbbbbbbbbbbbbbbbb-",
   3636         // On complement sets, span() and spanBack() get different results
   3637         // because b is not in the complement set and there is an odd number of b's
   3638         // in the test string.
   3639         "-bc",
   3640         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
   3641 
   3642         // Test with set strings with an initial or final code point span
   3643         // longer than 254.
   3644         "[a{" _64_a _64_a _64_a _64_a "b}"
   3645           "{a" _64_b _64_b _64_b _64_b "}]",
   3646         "-c",
   3647         _64_a _64_a _64_a _63_a "b",
   3648         _64_a _64_a _64_a _64_a "b",
   3649         _64_a _64_a _64_a _64_a "aaaabbbb",
   3650         "a" _64_b _64_b _64_b _63_b,
   3651         "a" _64_b _64_b _64_b _64_b,
   3652         "aaaabbbb" _64_b _64_b _64_b _64_b,
   3653 
   3654         // Test with strings containing unpaired surrogates.
   3655         // They are not representable in UTF-8, and a leading trail surrogate
   3656         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
   3657         // U+20001 == \\uD840\\uDC01
   3658         // U+20400 == \\uD841\\uDC00
   3659         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
   3660         "-8cl",
   3661         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
   3662     };
   3663     uint32_t whichSpans[96]={ SPAN_ALL };
   3664     int32_t whichSpansCount=1;
   3665 
   3666     UnicodeSet *sets[SET_COUNT]={ NULL };
   3667     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
   3668 
   3669     char testName[1024];
   3670     char *testNameLimit=testName;
   3671 
   3672     int32_t i, j;
   3673     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
   3674         const char *s=testdata[i];
   3675         if(s[0]=='[') {
   3676             // Create new test sets from this pattern.
   3677             for(j=0; j<SET_COUNT; ++j) {
   3678                 delete sets_with_str[j];
   3679                 delete sets[j];
   3680             }
   3681             UErrorCode errorCode=U_ZERO_ERROR;
   3682             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
   3683             if(U_FAILURE(errorCode)) {
   3684                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
   3685                 break;
   3686             }
   3687             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
   3688             sets[SLOW_NOT]->complement();
   3689             // Intermediate set: Test cloning of a frozen set.
   3690             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
   3691             fast->freeze();
   3692             sets[FAST]=(UnicodeSet *)fast->clone();
   3693             delete fast;
   3694             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
   3695             fastNot->freeze();
   3696             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
   3697             delete fastNot;
   3698 
   3699             for(j=0; j<SET_COUNT; ++j) {
   3700                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
   3701             }
   3702 
   3703             strcpy(testName, s);
   3704             testNameLimit=strchr(testName, 0);
   3705             *testNameLimit++=':';
   3706             *testNameLimit=0;
   3707 
   3708             whichSpans[0]=SPAN_ALL;
   3709             whichSpansCount=1;
   3710         } else if(s[0]=='-') {
   3711             whichSpans[0]=SPAN_ALL;
   3712             whichSpansCount=1;
   3713 
   3714             while(*++s!=0) {
   3715                 switch(*s) {
   3716                 case 'c':
   3717                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3718                                                    ~SPAN_POLARITY,
   3719                                                    SPAN_SET,
   3720                                                    SPAN_COMPLEMENT,
   3721                                                    0);
   3722                     break;
   3723                 case 'b':
   3724                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3725                                                    ~SPAN_DIRS,
   3726                                                    SPAN_FWD,
   3727                                                    SPAN_BACK,
   3728                                                    0);
   3729                     break;
   3730                 case 'l':
   3731                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
   3732                     // USET_SPAN_SIMPLE only FWD, and separately
   3733                     // USET_SPAN_SIMPLE only BACK
   3734                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3735                                                    ~(SPAN_DIRS|SPAN_CONDITION),
   3736                                                    SPAN_DIRS|SPAN_CONTAINED,
   3737                                                    SPAN_FWD|SPAN_SIMPLE,
   3738                                                    SPAN_BACK|SPAN_SIMPLE);
   3739                     break;
   3740                 case '8':
   3741                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3742                                                    ~SPAN_UTFS,
   3743                                                    SPAN_UTF16,
   3744                                                    SPAN_UTF8,
   3745                                                    0);
   3746                     break;
   3747                 default:
   3748                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
   3749                     break;
   3750                 }
   3751             }
   3752         } else if(0==strcmp(s, "*")) {
   3753             strcpy(testNameLimit, "bad_string");
   3754             for(j=0; j<whichSpansCount; ++j) {
   3755                 if(whichSpansCount>1) {
   3756                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
   3757                             "%%0x%3x",
   3758                             whichSpans[j]);
   3759                 }
   3760                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
   3761                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
   3762             }
   3763 
   3764             strcpy(testNameLimit, "contents");
   3765             for(j=0; j<whichSpansCount; ++j) {
   3766                 if(whichSpansCount>1) {
   3767                     sprintf(testNameLimit+8 /* strlen("contents") */,
   3768                             "%%0x%3x",
   3769                             whichSpans[j]);
   3770                 }
   3771                 testSpanContents(sets_with_str, whichSpans[j], testName);
   3772             }
   3773         } else {
   3774             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
   3775             strcpy(testNameLimit, "test_string");
   3776             for(j=0; j<whichSpansCount; ++j) {
   3777                 if(whichSpansCount>1) {
   3778                     sprintf(testNameLimit+11 /* strlen("test_string") */,
   3779                             "%%0x%3x",
   3780                             whichSpans[j]);
   3781                 }
   3782                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
   3783             }
   3784         }
   3785     }
   3786     for(j=0; j<SET_COUNT; ++j) {
   3787         delete sets_with_str[j];
   3788         delete sets[j];
   3789     }
   3790 }
   3791 
   3792 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
   3793 void UnicodeSetTest::TestStringSpan() {
   3794     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
   3795     static const char *const string=
   3796         "xx"
   3797         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3798         "xx"
   3799         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3800         "xx"
   3801         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
   3802         "aaaa";
   3803 
   3804     UErrorCode errorCode=U_ZERO_ERROR;
   3805     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
   3806     UnicodeSet set(pattern16, errorCode);
   3807     if(U_FAILURE(errorCode)) {
   3808         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3809         return;
   3810     }
   3811 
   3812     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
   3813 
   3814     if(set.containsAll(string16)) {
   3815         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
   3816     }
   3817 
   3818     // Remove trailing "aaaa".
   3819     string16.truncate(string16.length()-4);
   3820     if(!set.containsAll(string16)) {
   3821         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
   3822     }
   3823 
   3824     string16=UNICODE_STRING_SIMPLE("byayaxya");
   3825     const UChar *s16=string16.getBuffer();
   3826     int32_t length16=string16.length();
   3827     (void)length16;   // Suppress set but not used warning.
   3828     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
   3829         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
   3830         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
   3831         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
   3832         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
   3833         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
   3834     ) {
   3835         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
   3836     }
   3837 
   3838     pattern="[a{ab}{abc}{cd}]";
   3839     pattern16=UnicodeString(pattern, -1, US_INV);
   3840     set.applyPattern(pattern16, errorCode);
   3841     if(U_FAILURE(errorCode)) {
   3842         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3843         return;
   3844     }
   3845     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
   3846     s16=string16.getBuffer();
   3847     length16=string16.length();
   3848     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
   3849         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3850         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
   3851     ) {
   3852         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
   3853     }
   3854 
   3855     pattern="[d{cd}{bcd}{ab}]";
   3856     pattern16=UnicodeString(pattern, -1, US_INV);
   3857     set.applyPattern(pattern16, errorCode).freeze();
   3858     if(U_FAILURE(errorCode)) {
   3859         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3860         return;
   3861     }
   3862     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
   3863     s16=string16.getBuffer();
   3864     length16=string16.length();
   3865     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
   3866         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3867         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
   3868     ) {
   3869         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
   3870     }
   3871 }
   3872 
/**
 * Including collationroot.h fails on Windows with:
 *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
 * so this test is skipped on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions,
 * which in turn means that Windows header files cannot be used.
 */
   3881 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
   3882 #include "collationroot.h"
   3883 #include "collationtailoring.h"
   3884 #endif
   3885 
   3886 void UnicodeSetTest::TestUCAUnsafeBackwards() {
   3887 #if U_PLATFORM_HAS_WIN32_API
   3888     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
   3889 #elif !UCONFIG_NO_COLLATION
   3890     UErrorCode errorCode = U_ZERO_ERROR;
   3891 
   3892     // Get the unsafeBackwardsSet
   3893     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
   3894     if(U_FAILURE(errorCode)) {
   3895       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
   3896       return;
   3897     }
   3898     //const UVersionInfo &version = rootEntry->tailoring->version;
   3899     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
   3900 
   3901     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
   3902 
   3903     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
   3904         // simple test case
   3905         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
   3906         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
   3907         UnicodeSet surrogates;
   3908         surrogates.add(0xd83a);  // a lead surrogate
   3909         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
   3910         UnicodeString pat;
   3911         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
   3912         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
   3913         // so that at least one type of surrogate code points are escaped,
   3914         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
   3915         errorCode = U_ZERO_ERROR;
   3916         UnicodeSet s2;
   3917         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
   3918         if(U_FAILURE(errorCode)) {
   3919             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
   3920         } else {
   3921             checkEqual(surrogates, s2, "surrogates to/from pattern");
   3922         }
   3923         // This occurs in the UCA unsafe-backwards set.
   3924         checkRoundTrip(*unsafeBackwardSet);
   3925     }
   3926 #endif
   3927 }
   3928