1 /* 2 ******************************************************************************** 3 * Copyright (C) 1999-2009 International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************** 6 * Date Name Description 7 * 10/20/99 alan Creation. 8 * 03/22/2000 Madhu Added additional tests 9 ******************************************************************************** 10 */ 11 12 #include <stdio.h> 13 14 #include <string.h> 15 #include "unicode/utypes.h" 16 #include "usettest.h" 17 #include "unicode/ucnv.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/usetiter.h" 21 #include "unicode/ustring.h" 22 #include "unicode/parsepos.h" 23 #include "unicode/symtable.h" 24 #include "unicode/uversion.h" 25 #include "hash.h" 26 27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 28 29 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 30 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \ 31 u_errorName(status));}} 32 33 #define TEST_ASSERT(expr) {if (!(expr)) { \ 34 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }} 35 36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { 37 UnicodeString pat; 38 set.toPattern(pat); 39 return left + UnicodeSetTest::escape(pat); 40 } 41 42 #define CASE(id,test) case id: \ 43 name = #test; \ 44 if (exec) { \ 45 logln(#test "---"); \ 46 logln(); \ 47 test(); \ 48 } \ 49 break 50 51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { 52 } 53 54 UConverter *UnicodeSetTest::openUTF8Converter() { 55 if(utf8Cnv==NULL) { 56 UErrorCode errorCode=U_ZERO_ERROR; 57 utf8Cnv=ucnv_open("UTF-8", &errorCode); 58 } 59 return utf8Cnv; 60 } 61 62 UnicodeSetTest::~UnicodeSetTest() { 63 ucnv_close(utf8Cnv); 64 } 65 66 void 67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, 68 const char* &name, char* /*par*/) { 69 // if (exec) 
logln((UnicodeString)"TestSuite UnicodeSetTest"); 70 switch (index) { 71 CASE(0,TestPatterns); 72 CASE(1,TestAddRemove); 73 CASE(2,TestCategories); 74 CASE(3,TestCloneEqualHash); 75 CASE(4,TestMinimalRep); 76 CASE(5,TestAPI); 77 CASE(6,TestScriptSet); 78 CASE(7,TestPropertySet); 79 CASE(8,TestClone); 80 CASE(9,TestExhaustive); 81 CASE(10,TestToPattern); 82 CASE(11,TestIndexOf); 83 CASE(12,TestStrings); 84 CASE(13,Testj2268); 85 CASE(14,TestCloseOver); 86 CASE(15,TestEscapePattern); 87 CASE(16,TestInvalidCodePoint); 88 CASE(17,TestSymbolTable); 89 CASE(18,TestSurrogate); 90 CASE(19,TestPosixClasses); 91 CASE(20,TestIteration); 92 CASE(21,TestFreezable); 93 CASE(22,TestSpan); 94 CASE(23,TestStringSpan); 95 default: name = ""; break; 96 } 97 } 98 99 static const char NOT[] = "%%%%"; 100 101 /** 102 * UVector was improperly copying contents 103 * This code will crash this is still true 104 */ 105 void UnicodeSetTest::Testj2268() { 106 UnicodeSet t; 107 t.add(UnicodeString("abc")); 108 UnicodeSet test(t); 109 UnicodeString ustrPat; 110 test.toPattern(ustrPat, TRUE); 111 } 112 113 /** 114 * Test toPattern(). 115 */ 116 void UnicodeSetTest::TestToPattern() { 117 UErrorCode ec = U_ZERO_ERROR; 118 119 // Test that toPattern() round trips with syntax characters and 120 // whitespace. 121 { 122 static const char* OTHER_TOPATTERN_TESTS[] = { 123 "[[:latin:]&[:greek:]]", 124 "[[:latin:]-[:greek:]]", 125 "[:nonspacing mark:]", 126 NULL 127 }; 128 129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { 130 ec = U_ZERO_ERROR; 131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); 132 if (U_FAILURE(ec)) { 133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec))); 134 continue; 135 } 136 checkPat(OTHER_TOPATTERN_TESTS[j], s); 137 } 138 139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) { 140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { 141 142 // check various combinations to make sure they all work. 
143 if (i != 0 && !toPatternAux(i, i)){ 144 continue; 145 } 146 if (!toPatternAux(0, i)){ 147 continue; 148 } 149 if (!toPatternAux(i, 0xFFFF)){ 150 continue; 151 } 152 } 153 } 154 } 155 156 // Test pattern behavior of multicharacter strings. 157 { 158 ec = U_ZERO_ERROR; 159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); 160 161 // This loop isn't a loop. It's here to make the compiler happy. 162 // If you're curious, try removing it and changing the 'break' 163 // statements (except for the last) to goto's. 164 for (;;) { 165 if (U_FAILURE(ec)) break; 166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; 167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1); 168 169 s->add("ac"); 170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; 171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); 172 173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec); 174 if (U_FAILURE(ec)) break; 175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; 176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3); 177 178 s->add("[]"); 179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; 180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4); 181 182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec); 183 if (U_FAILURE(ec)) break; 184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; 185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5); 186 187 // j2189 188 s->clear(); 189 s->add(UnicodeString("abc", "")); 190 s->add(UnicodeString("abc", "")); 191 const char* exp6[] = {"abc", NOT, "ab", NULL}; 192 expectToPattern(*s, "[{abc}]", exp6); 193 194 break; 195 } 196 197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); 198 delete s; 199 } 200 201 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 202 UnicodeSet s; 203 s.add((UChar)97, (UChar)98); // 'a', 'b' 204 expectToPattern(s, "[ab]", NULL); 205 } 206 207 UBool 
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 208 209 // use Integer.toString because Utility.hex doesn't handle ints 210 UnicodeString pat = ""; 211 // TODO do these in hex 212 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 214 UnicodeString source; 215 source = source + (uint32_t)start; 216 if (start != end) 217 source = source + ".." + (uint32_t)end; 218 UnicodeSet testSet; 219 testSet.add(start, end); 220 return checkPat(source, testSet); 221 } 222 223 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 224 const UnicodeSet& testSet) { 225 // What we want to make sure of is that a pattern generated 226 // by toPattern(), with or without escaped unprintables, can 227 // be passed back into the UnicodeSet constructor. 228 UnicodeString pat0; 229 230 testSet.toPattern(pat0, TRUE); 231 232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 233 234 //String pat1 = unescapeLeniently(pat0); 235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 236 237 UnicodeString pat2; 238 testSet.toPattern(pat2, FALSE); 239 if (!checkPat(source, testSet, pat2)) return FALSE; 240 241 //String pat3 = unescapeLeniently(pat2); 242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 243 244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 246 return TRUE; 247 } 248 249 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 250 const UnicodeSet& testSet, 251 const UnicodeString& pat) { 252 UErrorCode ec = U_ZERO_ERROR; 253 UnicodeSet testSet2(pat, ec); 254 if (testSet2 != testSet) { 255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 256 return FALSE; 257 } 258 return TRUE; 259 } 260 261 void 262 UnicodeSetTest::TestPatterns(void) { 263 UnicodeSet set; 264 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 270 271 // Throw in a test of complement 272 set.complement(); 273 UnicodeString exp; 274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 275 expectPairs(set, exp); 276 } 277 278 void 279 UnicodeSetTest::TestCategories(void) { 280 UErrorCode status = U_ZERO_ERROR; 281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 282 UnicodeSet set(pat, status); 283 if (U_FAILURE(status)) { 284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 285 return; 286 } else { 287 expectContainment(set, pat, "ABC", "abc"); 288 } 289 290 UChar32 i; 291 int32_t failures = 0; 292 // Make sure generation of L doesn't pollute cached Lu set 293 // First generate L, then Lu 294 set.applyPattern("[:L:]", status); 295 if (U_FAILURE(status)) { errln("FAIL"); return; } 296 for (i=0; i<0x200; ++i) { 297 UBool l = u_isalpha((UChar)i); 298 if (l != set.contains(i)) { 299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 300 set.contains(i)); 301 if (++failures == 10) break; 302 } 303 } 304 305 set.applyPattern("[:Lu:]", status); 306 if (U_FAILURE(status)) { errln("FAIL"); return; } 307 for (i=0; i<0x200; ++i) { 308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 309 if (lu != set.contains(i)) { 310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 311 set.contains(i)); 312 if (++failures == 20) break; 313 } 314 } 315 } 316 void 317 UnicodeSetTest::TestCloneEqualHash(void) { 318 UErrorCode status = U_ZERO_ERROR; 319 // set1 and set2 used to be built 
with the obsolete constructor taking 320 // UCharCategory values; replaced with pattern constructors 321 // markus 20030502 322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 324 if (U_FAILURE(status)){ 325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 326 return; 327 } 328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 330 if (U_FAILURE(status)){ 331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 332 return; 333 } 334 335 if (*set1 != *set1a) { 336 errln("FAIL: category constructor for Ll broken"); 337 } 338 if (*set2 != *set2a) { 339 errln("FAIL: category constructor for Nd broken"); 340 } 341 delete set1a; 342 delete set2a; 343 344 logln("Testing copy construction"); 345 UnicodeSet *set1copy=new UnicodeSet(*set1); 346 if(*set1 != *set1copy || *set1 == *set2 || 347 getPairs(*set1) != getPairs(*set1copy) || 348 set1->hashCode() != set1copy->hashCode()){ 349 errln("FAIL : Error in copy construction"); 350 return; 351 } 352 353 logln("Testing =operator"); 354 UnicodeSet set1equal=*set1; 355 UnicodeSet set2equal=*set2; 356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 358 errln("FAIL: Error in =operator"); 359 } 360 361 logln("Testing clone()"); 362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 366 *set2clone == *set1 || 
*set2clone == set1equal || *set2clone == *set1clone){ 367 errln("FAIL: Error in clone"); 368 } 369 370 logln("Testing hashcode"); 371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 376 errln("FAIL: Error in hashCode()"); 377 } 378 379 delete set1; 380 delete set1copy; 381 delete set2; 382 delete set1clone; 383 delete set2clone; 384 385 386 } 387 void 388 UnicodeSetTest::TestAddRemove(void) { 389 UnicodeSet set; // Construct empty set 390 doAssert(set.isEmpty() == TRUE, "set should be empty"); 391 doAssert(set.size() == 0, "size should be 0"); 392 set.complement(); 393 doAssert(set.size() == 0x110000, "size should be 0x110000"); 394 set.clear(); 395 set.add(0x0061, 0x007a); 396 expectPairs(set, "az"); 397 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 398 doAssert(set.size() != 0, "size should not be equal to 0"); 399 doAssert(set.size() == 26, "size should be equal to 26"); 400 set.remove(0x006d, 0x0070); 401 expectPairs(set, "alqz"); 402 doAssert(set.size() == 22, "size should be equal to 22"); 403 set.remove(0x0065, 0x0067); 404 expectPairs(set, "adhlqz"); 405 doAssert(set.size() == 19, "size should be equal to 19"); 406 set.remove(0x0064, 0x0069); 407 expectPairs(set, "acjlqz"); 408 doAssert(set.size() == 16, "size should be equal to 16"); 409 set.remove(0x0063, 0x0072); 410 expectPairs(set, "absz"); 411 doAssert(set.size() == 10, "size should be equal to 10"); 412 set.add(0x0066, 0x0071); 413 expectPairs(set, "abfqsz"); 414 doAssert(set.size() == 22, "size should be equal to 22"); 415 set.remove(0x0061, 0x0067); 416 expectPairs(set, 
"hqsz"); 417 set.remove(0x0061, 0x007a); 418 expectPairs(set, ""); 419 doAssert(set.isEmpty() == TRUE, "set should be empty"); 420 doAssert(set.size() == 0, "size should be 0"); 421 set.add(0x0061); 422 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 423 doAssert(set.size() == 1, "size should not be equal to 1"); 424 set.add(0x0062); 425 set.add(0x0063); 426 expectPairs(set, "ac"); 427 doAssert(set.size() == 3, "size should not be equal to 3"); 428 set.add(0x0070); 429 set.add(0x0071); 430 expectPairs(set, "acpq"); 431 doAssert(set.size() == 5, "size should not be equal to 5"); 432 set.clear(); 433 expectPairs(set, ""); 434 doAssert(set.isEmpty() == TRUE, "set should be empty"); 435 doAssert(set.size() == 0, "size should be 0"); 436 437 // Try removing an entire set from another set 438 expectPattern(set, "[c-x]", "cx"); 439 UnicodeSet set2; 440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 441 set.removeAll(set2); 442 expectPairs(set, "deluxx"); 443 444 // Try adding an entire set to another set 445 expectPattern(set, "[jackiemclean]", "aacceein"); 446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 447 set.addAll(set2); 448 expectPairs(set, "aacehort"); 449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 450 451 // Try retaining an set of elements contained in another set (intersection) 452 UnicodeSet set3; 453 expectPattern(set3, "[a-c]", "ac"); 454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 455 set3.remove(0x0062); 456 expectPairs(set3, "aacc"); 457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 458 set.retainAll(set3); 459 expectPairs(set, "aacc"); 460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 462 set.clear(); 463 doAssert(set.size() != set3.size(), "set.size() != 
set3.size()"); 464 465 // Test commutativity 466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 467 expectPattern(set2, "[jackiemclean]", "aacceein"); 468 set.addAll(set2); 469 expectPairs(set, "aacehort"); 470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 471 472 473 474 475 } 476 477 /** 478 * Make sure minimal representation is maintained. 479 */ 480 void UnicodeSetTest::TestMinimalRep() { 481 UErrorCode status = U_ZERO_ERROR; 482 // This is pretty thoroughly tested by checkCanonicalRep() 483 // run against the exhaustive operation results. Use the code 484 // here for debugging specific spot problems. 485 486 // 1 overlap against 2 487 UnicodeSet set("[h-km-q]", status); 488 if (U_FAILURE(status)) { errln("FAIL"); return; } 489 UnicodeSet set2("[i-o]", status); 490 if (U_FAILURE(status)) { errln("FAIL"); return; } 491 set.addAll(set2); 492 expectPairs(set, "hq"); 493 // right 494 set.applyPattern("[a-m]", status); 495 if (U_FAILURE(status)) { errln("FAIL"); return; } 496 set2.applyPattern("[e-o]", status); 497 if (U_FAILURE(status)) { errln("FAIL"); return; } 498 set.addAll(set2); 499 expectPairs(set, "ao"); 500 // left 501 set.applyPattern("[e-o]", status); 502 if (U_FAILURE(status)) { errln("FAIL"); return; } 503 set2.applyPattern("[a-m]", status); 504 if (U_FAILURE(status)) { errln("FAIL"); return; } 505 set.addAll(set2); 506 expectPairs(set, "ao"); 507 // 1 overlap against 3 508 set.applyPattern("[a-eg-mo-w]", status); 509 if (U_FAILURE(status)) { errln("FAIL"); return; } 510 set2.applyPattern("[d-q]", status); 511 if (U_FAILURE(status)) { errln("FAIL"); return; } 512 set.addAll(set2); 513 expectPairs(set, "aw"); 514 } 515 516 void UnicodeSetTest::TestAPI() { 517 UErrorCode status = U_ZERO_ERROR; 518 // default ct 519 UnicodeSet set; 520 if (!set.isEmpty() || set.getRangeCount() != 0) { 521 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 522 set); 523 } 524 525 // clear(), 
isEmpty() 526 set.add(0x0061); 527 if (set.isEmpty()) { 528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 529 set); 530 } 531 set.clear(); 532 if (!set.isEmpty()) { 533 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 534 set); 535 } 536 537 // size() 538 set.clear(); 539 if (set.size() != 0) { 540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 541 ": " + set); 542 } 543 set.add(0x0061); 544 if (set.size() != 1) { 545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 546 ": " + set); 547 } 548 set.add(0x0031, 0x0039); 549 if (set.size() != 10) { 550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 551 ": " + set); 552 } 553 554 // contains(first, last) 555 set.clear(); 556 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 557 if (U_FAILURE(status)) { errln("FAIL"); return; } 558 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 559 UChar32 a = set.getRangeStart(i); 560 UChar32 b = set.getRangeEnd(i); 561 if (!set.contains(a, b)) { 562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 563 " but doesn't: " + set); 564 } 565 if (set.contains((UChar32)(a-1), b)) { 566 errln((UnicodeString)"FAIL, shouldn't contain " + 567 (unsigned short)(a-1) + '-' + (unsigned short)b + 568 " but does: " + set); 569 } 570 if (set.contains(a, (UChar32)(b+1))) { 571 errln((UnicodeString)"FAIL, shouldn't contain " + 572 (unsigned short)a + '-' + (unsigned short)(b+1) + 573 " but does: " + set); 574 } 575 } 576 577 // Ported InversionList test. 
578 UnicodeSet a((UChar32)3,(UChar32)10); 579 UnicodeSet b((UChar32)7,(UChar32)15); 580 UnicodeSet c; 581 582 logln((UnicodeString)"a [3-10]: " + a); 583 logln((UnicodeString)"b [7-15]: " + b); 584 c = a; 585 c.addAll(b); 586 UnicodeSet exp((UChar32)3,(UChar32)15); 587 if (c == exp) { 588 logln((UnicodeString)"c.set(a).add(b): " + c); 589 } else { 590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 591 } 592 c.complement(); 593 exp.set((UChar32)0, (UChar32)2); 594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 595 if (c == exp) { 596 logln((UnicodeString)"c.complement(): " + c); 597 } else { 598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 599 } 600 c.complement(); 601 exp.set((UChar32)3, (UChar32)15); 602 if (c == exp) { 603 logln((UnicodeString)"c.complement(): " + c); 604 } else { 605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 606 } 607 c = a; 608 c.complementAll(b); 609 exp.set((UChar32)3,(UChar32)6); 610 exp.add((UChar32)11,(UChar32) 15); 611 if (c == exp) { 612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 613 } else { 614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 615 } 616 617 exp = c; 618 bitsToSet(setToBits(c), c); 619 if (c == exp) { 620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 621 } else { 622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 623 } 624 625 // Additional tests for coverage JB#2118 626 //UnicodeSet::complement(class UnicodeString const &) 627 //UnicodeSet::complementAll(class UnicodeString const &) 628 //UnicodeSet::containsNone(class UnicodeSet const &) 629 //UnicodeSet::containsNone(long,long) 630 //UnicodeSet::containsSome(class UnicodeSet const &) 631 //UnicodeSet::containsSome(long,long) 632 //UnicodeSet::removeAll(class UnicodeString const &) 633 //UnicodeSet::retain(long) 634 //UnicodeSet::retainAll(class UnicodeString const &) 635 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 636 //UnicodeSetIterator::getString(void) 637 set.clear(); 638 set.complement("ab"); 639 exp.applyPattern("[{ab}]", status); 640 if (U_FAILURE(status)) { errln("FAIL"); return; } 641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 642 643 UnicodeSetIterator iset(set); 644 if (!iset.next() || !iset.isString()) { 645 errln("FAIL: UnicodeSetIterator::next/isString"); 646 } else if (iset.getString() != "ab") { 647 errln("FAIL: UnicodeSetIterator::getString"); 648 } 649 650 set.add((UChar32)0x61, (UChar32)0x7A); 651 set.complementAll("alan"); 652 exp.applyPattern("[{ab}b-kmo-z]", status); 653 if (U_FAILURE(status)) { errln("FAIL"); return; } 654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 655 656 exp.applyPattern("[a-z]", status); 657 if (U_FAILURE(status)) { errln("FAIL"); return; } 658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 660 exp.applyPattern("[aln]", status); 661 if (U_FAILURE(status)) { errln("FAIL"); return; } 662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 664 665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 666 errln("FAIL: containsNone(UChar32, UChar32)"); 667 } 668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 669 errln("FAIL: containsSome(UChar32, UChar32)"); 670 } 671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 672 errln("FAIL: containsNone(UChar32, UChar32)"); 673 } 674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 675 errln("FAIL: containsSome(UChar32, UChar32)"); 676 } 677 678 set.removeAll("liu"); 679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 680 if (U_FAILURE(status)) { errln("FAIL"); return; } 681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 682 683 set.retainAll("star"); 684 
exp.applyPattern("[rst]", status); 685 if (U_FAILURE(status)) { errln("FAIL"); return; } 686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 687 688 set.retain((UChar32)0x73); 689 exp.applyPattern("[s]", status); 690 if (U_FAILURE(status)) { errln("FAIL"); return; } 691 if (set != exp) { errln("FAIL: retain('s')"); return; } 692 693 uint16_t buf[32]; 694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 697 errln("FAIL: serialize"); 698 return; 699 } 700 701 // Conversions to and from USet 702 UnicodeSet *uniset = &set; 703 USet *uset = uniset->toUSet(); 704 TEST_ASSERT((void *)uset == (void *)uniset); 705 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 706 TEST_ASSERT((void *)setx == (void *)uset); 707 const UnicodeSet *constSet = uniset; 708 const USet *constUSet = constSet->toUSet(); 709 TEST_ASSERT((void *)constUSet == (void *)constSet); 710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 711 TEST_ASSERT((void *)constSetx == (void *)constUSet); 712 } 713 714 void UnicodeSetTest::TestIteration() { 715 UErrorCode ec = U_ZERO_ERROR; 716 int i = 0; 717 int outerLoop; 718 719 // 6 code points, 3 ranges, 2 strings, 8 total elements 720 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 721 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 722 TEST_ASSERT_SUCCESS(ec); 723 UnicodeSetIterator it(set); 724 725 for (outerLoop=0; outerLoop<3; outerLoop++) { 726 // Run the test multiple times, to check that iterator.reset() is working. 
727 for (i=0; i<10; i++) { 728 UBool nextv = it.next(); 729 UBool isString = it.isString(); 730 int32_t codePoint = it.getCodepoint(); 731 //int32_t codePointEnd = it.getCodepointEnd(); 732 UnicodeString s = it.getString(); 733 switch (i) { 734 case 0: 735 TEST_ASSERT(nextv == TRUE); 736 TEST_ASSERT(isString == FALSE); 737 TEST_ASSERT(codePoint==0x61); 738 TEST_ASSERT(s == "a"); 739 break; 740 case 1: 741 TEST_ASSERT(nextv == TRUE); 742 TEST_ASSERT(isString == FALSE); 743 TEST_ASSERT(codePoint==0x62); 744 TEST_ASSERT(s == "b"); 745 break; 746 case 2: 747 TEST_ASSERT(nextv == TRUE); 748 TEST_ASSERT(isString == FALSE); 749 TEST_ASSERT(codePoint==0x63); 750 TEST_ASSERT(s == "c"); 751 break; 752 case 3: 753 TEST_ASSERT(nextv == TRUE); 754 TEST_ASSERT(isString == FALSE); 755 TEST_ASSERT(codePoint==0x79); 756 TEST_ASSERT(s == "y"); 757 break; 758 case 4: 759 TEST_ASSERT(nextv == TRUE); 760 TEST_ASSERT(isString == FALSE); 761 TEST_ASSERT(codePoint==0x7a); 762 TEST_ASSERT(s == "z"); 763 break; 764 case 5: 765 TEST_ASSERT(nextv == TRUE); 766 TEST_ASSERT(isString == FALSE); 767 TEST_ASSERT(codePoint==0x1abcd); 768 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 769 break; 770 case 6: 771 TEST_ASSERT(nextv == TRUE); 772 TEST_ASSERT(isString == TRUE); 773 TEST_ASSERT(s == "str1"); 774 break; 775 case 7: 776 TEST_ASSERT(nextv == TRUE); 777 TEST_ASSERT(isString == TRUE); 778 TEST_ASSERT(s == "str2"); 779 break; 780 case 8: 781 TEST_ASSERT(nextv == FALSE); 782 break; 783 case 9: 784 TEST_ASSERT(nextv == FALSE); 785 break; 786 } 787 } 788 it.reset(); // prepare to run the iteration again. 
789 } 790 } 791 792 793 794 795 void UnicodeSetTest::TestStrings() { 796 UErrorCode ec = U_ZERO_ERROR; 797 798 UnicodeSet* testList[] = { 799 UnicodeSet::createFromAll("abc"), 800 new UnicodeSet("[a-c]", ec), 801 802 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 803 new UnicodeSet("[{ll}{ch}a-z]", ec), 804 805 UnicodeSet::createFrom("ab}c"), 806 new UnicodeSet("[{ab\\}c}]", ec), 807 808 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 809 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 810 811 NULL 812 }; 813 814 if (U_FAILURE(ec)) { 815 errln("FAIL: couldn't construct test sets"); 816 } 817 818 for (int32_t i = 0; testList[i] != NULL; i+=2) { 819 if (U_SUCCESS(ec)) { 820 UnicodeString pat0, pat1; 821 testList[i]->toPattern(pat0, TRUE); 822 testList[i+1]->toPattern(pat1, TRUE); 823 if (*testList[i] == *testList[i+1]) { 824 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 825 } else { 826 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 827 } 828 } 829 delete testList[i]; 830 delete testList[i+1]; 831 } 832 } 833 834 /** 835 * Test the [:Latin:] syntax. 836 */ 837 void UnicodeSetTest::TestScriptSet() { 838 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 839 840 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 841 842 /* Jitterbug 1423 */ 843 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 844 845 } 846 847 /** 848 * Test the [:Latin:] syntax. 
849 */ 850 void UnicodeSetTest::TestPropertySet() { 851 static const char* const DATA[] = { 852 // Pattern, Chars IN, Chars NOT in 853 854 "[:Latin:]", 855 "aA", 856 "\\u0391\\u03B1", 857 858 "[\\p{Greek}]", 859 "\\u0391\\u03B1", 860 "aA", 861 862 "\\P{ GENERAL Category = upper case letter }", 863 "abc", 864 "ABC", 865 866 // Combining class: @since ICU 2.2 867 // Check both symbolic and numeric 868 "\\p{ccc=Nukta}", 869 "\\u0ABC", 870 "abc", 871 872 "\\p{Canonical Combining Class = 11}", 873 "\\u05B1", 874 "\\u05B2", 875 876 "[:c c c = iota subscript :]", 877 "\\u0345", 878 "xyz", 879 880 // Bidi class: @since ICU 2.2 881 "\\p{bidiclass=lefttoright}", 882 "abc", 883 "\\u0671\\u0672", 884 885 // Binary properties: @since ICU 2.2 886 "\\p{ideographic}", 887 "\\u4E0A", 888 "x", 889 890 "[:math=false:]", 891 "q)*(", 892 // weiv: )(and * were removed from math in Unicode 4.0.1 893 //"(*+)", 894 "+<>^", 895 896 // JB#1767 \N{}, \p{ASCII} 897 "[:Ascii:]", 898 "abc\\u0000\\u007F", 899 "\\u0080\\u4E00", 900 901 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 902 "az", 903 "qrs", 904 905 // JB#2015 906 "[:any:]", 907 "a\\U0010FFFF", 908 "", 909 910 "[:nv=0.5:]", 911 "\\u00BD\\u0F2A", 912 "\\u00BC", 913 914 // JB#2653: Age 915 "[:Age=1.1:]", 916 "\\u03D6", // 1.1 917 "\\u03D8\\u03D9", // 3.2 918 919 "[:Age=3.1:]", 920 "\\u1800\\u3400\\U0002f800", 921 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 922 923 // JB#2350: Case_Sensitive 924 "[:Case Sensitive:]", 925 "A\\u1FFC\\U00010410", 926 ";\\u00B4\\U00010500", 927 928 // JB#2832: C99-compatibility props 929 "[:blank:]", 930 " \\u0009", 931 "1-9A-Z", 932 933 "[:graph:]", 934 "19AZ", 935 " \\u0003\\u0007\\u0009\\u000A\\u000D", 936 937 "[:punct:]", 938 "!@#%&*()[]{}-_\\/;:,.?'\"", 939 "09azAZ", 940 941 "[:xdigit:]", 942 "09afAF", 943 "gG!", 944 945 // Regex compatibility test 946 "[-b]", // leading '-' is literal 947 "-b", 948 "ac", 949 950 "[^-b]", // leading '-' is literal 951 "ac", 952 "-b", 953 
954 "[b-]", // trailing '-' is literal 955 "-b", 956 "ac", 957 958 "[^b-]", // trailing '-' is literal 959 "ac", 960 "-b", 961 962 "[a-b-]", // trailing '-' is literal 963 "ab-", 964 "c=", 965 966 "[[a-q]&[p-z]-]", // trailing '-' is literal 967 "pq-", 968 "or=", 969 970 "[\\s|\\)|:|$|\\>]", // from regex tests 971 "s|):$>", 972 "abc", 973 974 "[\\uDC00cd]", // JB#2906: isolated trail at start 975 "cd\\uDC00", 976 "ab\\uD800\\U00010000", 977 978 "[ab\\uD800]", // JB#2906: isolated trail at start 979 "ab\\uD800", 980 "cd\\uDC00\\U00010000", 981 982 "[ab\\uD800cd]", // JB#2906: isolated lead in middle 983 "abcd\\uD800", 984 "ef\\uDC00\\U00010000", 985 986 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle 987 "abcd\\uDC00", 988 "ef\\uD800\\U00010000", 989 990 "[:^lccc=0:]", // Lead canonical class 991 "\\u0300\\u0301", 992 "abcd\\u00c0\\u00c5", 993 994 "[:^tccc=0:]", // Trail canonical class 995 "\\u0300\\u0301\\u00c0\\u00c5", 996 "abcd", 997 998 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 999 "\\u0300\\u0301\\u00c0\\u00c5", 1000 "abcd", 1001 1002 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1003 "", 1004 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1005 1006 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not 1007 "\\u0F73\\u0F75\\u0F81", 1008 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1009 1010 "[:Assigned:]", 1011 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1012 "\\u0888\\uFDD3\\uFFFE\\U00050005" 1013 }; 1014 1015 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]); 1016 1017 for (int32_t i=0; i<DATA_LEN; i+=3) { 1018 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]), 1019 CharsToUnicodeString(DATA[i+2])); 1020 } 1021 } 1022 1023 /** 1024 * Test that Posix style character classes [:digit:], etc. 1025 * have the Unicode definitions from TR 18. 
1026 */ 1027 void UnicodeSetTest::TestPosixClasses() { 1028 { 1029 UErrorCode status = U_ZERO_ERROR; 1030 UnicodeSet s1("[:alpha:]", status); 1031 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1032 TEST_ASSERT_SUCCESS(status); 1033 TEST_ASSERT(s1==s2); 1034 } 1035 { 1036 UErrorCode status = U_ZERO_ERROR; 1037 UnicodeSet s1("[:lower:]", status); 1038 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1039 TEST_ASSERT_SUCCESS(status); 1040 TEST_ASSERT(s1==s2); 1041 } 1042 { 1043 UErrorCode status = U_ZERO_ERROR; 1044 UnicodeSet s1("[:upper:]", status); 1045 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1046 TEST_ASSERT_SUCCESS(status); 1047 TEST_ASSERT(s1==s2); 1048 } 1049 { 1050 UErrorCode status = U_ZERO_ERROR; 1051 UnicodeSet s1("[:punct:]", status); 1052 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1053 TEST_ASSERT_SUCCESS(status); 1054 TEST_ASSERT(s1==s2); 1055 } 1056 { 1057 UErrorCode status = U_ZERO_ERROR; 1058 UnicodeSet s1("[:digit:]", status); 1059 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1060 TEST_ASSERT_SUCCESS(status); 1061 TEST_ASSERT(s1==s2); 1062 } 1063 { 1064 UErrorCode status = U_ZERO_ERROR; 1065 UnicodeSet s1("[:xdigit:]", status); 1066 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1067 TEST_ASSERT_SUCCESS(status); 1068 TEST_ASSERT(s1==s2); 1069 } 1070 { 1071 UErrorCode status = U_ZERO_ERROR; 1072 UnicodeSet s1("[:alnum:]", status); 1073 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1074 TEST_ASSERT_SUCCESS(status); 1075 TEST_ASSERT(s1==s2); 1076 } 1077 { 1078 UErrorCode status = U_ZERO_ERROR; 1079 UnicodeSet s1("[:space:]", status); 1080 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1081 TEST_ASSERT_SUCCESS(status); 1082 TEST_ASSERT(s1==s2); 1083 } 1084 { 1085 UErrorCode status = U_ZERO_ERROR; 1086 UnicodeSet s1("[:blank:]", status); 1087 
TEST_ASSERT_SUCCESS(status); 1088 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1089 status); 1090 TEST_ASSERT_SUCCESS(status); 1091 TEST_ASSERT(s1==s2); 1092 } 1093 { 1094 UErrorCode status = U_ZERO_ERROR; 1095 UnicodeSet s1("[:cntrl:]", status); 1096 TEST_ASSERT_SUCCESS(status); 1097 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1098 TEST_ASSERT_SUCCESS(status); 1099 TEST_ASSERT(s1==s2); 1100 } 1101 { 1102 UErrorCode status = U_ZERO_ERROR; 1103 UnicodeSet s1("[:graph:]", status); 1104 TEST_ASSERT_SUCCESS(status); 1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1106 TEST_ASSERT_SUCCESS(status); 1107 TEST_ASSERT(s1==s2); 1108 } 1109 { 1110 UErrorCode status = U_ZERO_ERROR; 1111 UnicodeSet s1("[:print:]", status); 1112 TEST_ASSERT_SUCCESS(status); 1113 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1114 TEST_ASSERT_SUCCESS(status); 1115 TEST_ASSERT(s1==s2); 1116 } 1117 } 1118 /** 1119 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 1120 */ 1121 void UnicodeSetTest::TestClone() { 1122 UErrorCode ec = U_ZERO_ERROR; 1123 UnicodeSet s("[abcxyz]", ec); 1124 UnicodeSet t(s); 1125 expectContainment(t, "abc", "def"); 1126 } 1127 1128 /** 1129 * Test the indexOf() and charAt() methods. 
1130 */ 1131 void UnicodeSetTest::TestIndexOf() { 1132 UErrorCode ec = U_ZERO_ERROR; 1133 UnicodeSet set("[a-cx-y3578]", ec); 1134 if (U_FAILURE(ec)) { 1135 errln("FAIL: UnicodeSet constructor"); 1136 return; 1137 } 1138 for (int32_t i=0; i<set.size(); ++i) { 1139 UChar32 c = set.charAt(i); 1140 if (set.indexOf(c) != i) { 1141 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1142 i, c, set.indexOf(c)); 1143 } 1144 } 1145 UChar32 c = set.charAt(set.size()); 1146 if (c != -1) { 1147 errln("FAIL: charAt(<out of range>) = %X", c); 1148 } 1149 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1150 if (j != -1) { 1151 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1152 } 1153 } 1154 1155 /** 1156 * Test closure API. 1157 */ 1158 void UnicodeSetTest::TestCloseOver() { 1159 UErrorCode ec = U_ZERO_ERROR; 1160 1161 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1162 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1163 const char* DATA[] = { 1164 // selector, input, output 1165 CASE, 1166 "[aq\\u00DF{Bc}{bC}{Fi}]", 1167 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1168 1169 CASE, 1170 "[\\u01F1]", // 'DZ' 1171 "[\\u01F1\\u01F2\\u01F3]", 1172 1173 CASE, 1174 "[\\u1FB4]", 1175 "[\\u1FB4{\\u03AC\\u03B9}]", 1176 1177 CASE, 1178 "[{F\\uFB01}]", 1179 "[\\uFB03{ffi}]", 1180 1181 CASE, // make sure binary search finds limits 1182 "[a\\uFF3A]", 1183 "[aA\\uFF3A\\uFF5A]", 1184 1185 CASE, 1186 "[a-z]","[A-Za-z\\u017F\\u212A]", 1187 CASE, 1188 "[abc]","[A-Ca-c]", 1189 CASE, 1190 "[ABC]","[A-Ca-c]", 1191 1192 CASE, "[i]", "[iI]", 1193 1194 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1195 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1196 1197 CASE, "[\\u0131]", "[\\u0131]", // dotless i 1198 1199 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1200 1201 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1202 1203 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1204 1205 
CASE, "[\\u03f7]", "[\\u03f7\\u03f8]", 1206 1207 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1208 1209 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1210 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1211 1212 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1213 1214 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1215 1216 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1217 1218 CASE_MAPPINGS, 1219 "[aq\\u00DF{Bc}{bC}{Fi}]", 1220 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1221 1222 CASE_MAPPINGS, 1223 "[\\u01F1]", // 'DZ' 1224 "[\\u01F1\\u01F2\\u01F3]", 1225 1226 CASE_MAPPINGS, 1227 "[a-z]", 1228 "[A-Za-z]", 1229 1230 NULL 1231 }; 1232 1233 UnicodeSet s; 1234 UnicodeSet t; 1235 UnicodeString buf; 1236 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1237 int32_t selector = DATA[i][0]; 1238 UnicodeString pat(DATA[i+1], -1, US_INV); 1239 UnicodeString exp(DATA[i+2], -1, US_INV); 1240 s.applyPattern(pat, ec); 1241 s.closeOver(selector); 1242 t.applyPattern(exp, ec); 1243 if (U_FAILURE(ec)) { 1244 errln("FAIL: applyPattern failed"); 1245 continue; 1246 } 1247 if (s == t) { 1248 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1249 } else { 1250 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1251 s.toPattern(buf, TRUE) + ", expected " + exp); 1252 } 1253 } 1254 1255 #if 0 1256 /* 1257 * Unused test code. 1258 * This was used to compare the old implementation (using USET_CASE) 1259 * with the new one (using 0x100 temporarily) 1260 * while transitioning from hardcoded case closure tables in uniset.cpp 1261 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1262 * and using ucase.c functions for closure. 
1263 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1264 * 1265 * Note: The old and new implementation never fully matched because 1266 * the old implementation turned out to not map U+0130 and U+0131 correctly 1267 * (dotted I and dotless i) and because the old implementation's data tables 1268 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1269 * new implementation. (So sigmas and some other characters were not handled 1270 * according to the newer Unicode version.) 1271 */ 1272 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1273 UnicodeSetIterator si(sens); 1274 UnicodeString str, buf2; 1275 const UnicodeString *pStr; 1276 UChar32 c; 1277 while(si.next()) { 1278 if(!si.isString()) { 1279 c=si.getCodepoint(); 1280 s.clear(); 1281 s.add(c); 1282 1283 str.setTo(c); 1284 str.foldCase(); 1285 sens2.add(str); 1286 1287 t=s; 1288 s.closeOver(USET_CASE); 1289 t.closeOver(0x100); 1290 if(s!=t) { 1291 errln("FAIL: closeOver(U+%04x) differs: ", c); 1292 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1293 } 1294 } 1295 } 1296 // remove all code points 1297 // should contain all full case folding mapping strings 1298 sens2.remove(0, 0x10ffff); 1299 si.reset(sens2); 1300 while(si.next()) { 1301 if(si.isString()) { 1302 pStr=&si.getString(); 1303 s.clear(); 1304 s.add(*pStr); 1305 t=s2=s; 1306 s.closeOver(USET_CASE); 1307 t.closeOver(0x100); 1308 if(s!=t) { 1309 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1310 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1311 } 1312 } 1313 } 1314 #endif 1315 1316 // Test the pattern API 1317 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1318 if (U_FAILURE(ec)) { 1319 errln("FAIL: applyPattern failed"); 1320 } else { 1321 expectContainment(s, "abcABC", "defDEF"); 1322 } 1323 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1324 if (U_FAILURE(ec)) { 1325 
errln("FAIL: constructor failed"); 1326 } else { 1327 expectContainment(v, "defDEF", "abcABC"); 1328 } 1329 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1330 if (U_FAILURE(ec)) { 1331 errln("FAIL: construct w/case mappings failed"); 1332 } else { 1333 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1334 } 1335 } 1336 1337 void UnicodeSetTest::TestEscapePattern() { 1338 const char pattern[] = 1339 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1340 const char exp[] = 1341 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1342 // We test this with two passes; in the second pass we 1343 // pre-unescape the pattern. Since U+200E is rule whitespace, 1344 // this fails -- which is what we expect. 1345 for (int32_t pass=1; pass<=2; ++pass) { 1346 UErrorCode ec = U_ZERO_ERROR; 1347 UnicodeString pat(pattern, -1, US_INV); 1348 if (pass==2) { 1349 pat = pat.unescape(); 1350 } 1351 // Pattern is only good for pass 1 1352 UBool isPatternValid = (pass==1); 1353 1354 UnicodeSet set(pat, ec); 1355 if (U_SUCCESS(ec) != isPatternValid){ 1356 errln((UnicodeString)"FAIL: applyPattern(" + 1357 escape(pat) + ") => " + 1358 u_errorName(ec)); 1359 continue; 1360 } 1361 if (U_FAILURE(ec)) { 1362 continue; 1363 } 1364 if (set.contains((UChar)0x0644)){ 1365 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1366 } 1367 1368 UnicodeString newpat; 1369 set.toPattern(newpat, TRUE); 1370 if (newpat == UnicodeString(exp, -1, US_INV)) { 1371 logln(escape(pat) + " => " + newpat); 1372 } else { 1373 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1374 } 1375 1376 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1377 UnicodeString str("Range "); 1378 str.append((UChar)(0x30 + i)) 1379 .append(": ") 1380 .append((UChar32)set.getRangeStart(i)) 1381 .append(" - ") 1382 .append((UChar32)set.getRangeEnd(i)); 1383 str = str + " (" + set.getRangeStart(i) + " - " + 1384 
set.getRangeEnd(i) + ")"; 1385 if (set.getRangeStart(i) < 0) { 1386 errln((UnicodeString)"FAIL: " + escape(str)); 1387 } else { 1388 logln(escape(str)); 1389 } 1390 } 1391 } 1392 } 1393 1394 void UnicodeSetTest::expectRange(const UnicodeString& label, 1395 const UnicodeSet& set, 1396 UChar32 start, UChar32 end) { 1397 UnicodeSet exp(start, end); 1398 UnicodeString pat; 1399 if (set == exp) { 1400 logln(label + " => " + set.toPattern(pat, TRUE)); 1401 } else { 1402 UnicodeString xpat; 1403 errln((UnicodeString)"FAIL: " + label + " => " + 1404 set.toPattern(pat, TRUE) + 1405 ", expected " + exp.toPattern(xpat, TRUE)); 1406 } 1407 } 1408 1409 void UnicodeSetTest::TestInvalidCodePoint() { 1410 1411 const UChar32 DATA[] = { 1412 // Test range Expected range 1413 0, 0x10FFFF, 0, 0x10FFFF, 1414 (UChar32)-1, 8, 0, 8, 1415 8, 0x110000, 8, 0x10FFFF 1416 }; 1417 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1418 1419 UnicodeString pat; 1420 int32_t i; 1421 1422 for (i=0; i<DATA_LENGTH; i+=4) { 1423 UChar32 start = DATA[i]; 1424 UChar32 end = DATA[i+1]; 1425 UChar32 xstart = DATA[i+2]; 1426 UChar32 xend = DATA[i+3]; 1427 1428 // Try various API using the test code points 1429 1430 UnicodeSet set(start, end); 1431 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1432 set, xstart, xend); 1433 1434 set.clear(); 1435 set.set(start, end); 1436 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1437 set, xstart, xend); 1438 1439 UBool b = set.contains(start); 1440 b = set.contains(start, end); 1441 b = set.containsNone(start, end); 1442 b = set.containsSome(start, end); 1443 1444 /*int32_t index = set.indexOf(start);*/ 1445 1446 set.clear(); 1447 set.add(start); 1448 set.add(start, end); 1449 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1450 set, xstart, xend); 1451 1452 set.set(0, 0x10FFFF); 1453 set.retain(start, end); 1454 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1455 set, xstart, xend); 1456 
set.retain(start); 1457 1458 set.set(0, 0x10FFFF); 1459 set.remove(start); 1460 set.remove(start, end); 1461 set.complement(); 1462 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1463 set, xstart, xend); 1464 1465 set.set(0, 0x10FFFF); 1466 set.complement(start, end); 1467 set.complement(); 1468 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1469 set, xstart, xend); 1470 set.complement(start); 1471 } 1472 1473 const UChar32 DATA2[] = { 1474 0, 1475 0x10FFFF, 1476 (UChar32)-1, 1477 0x110000 1478 }; 1479 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1480 1481 for (i=0; i<DATA2_LENGTH; ++i) { 1482 UChar32 c = DATA2[i], end = 0x10FFFF; 1483 UBool valid = (c >= 0 && c <= 0x10FFFF); 1484 1485 UnicodeSet set(0, 0x10FFFF); 1486 1487 // For single-codepoint contains, invalid codepoints are NOT contained 1488 UBool b = set.contains(c); 1489 if (b == valid) { 1490 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1491 ") = " + b); 1492 } else { 1493 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1494 ") = " + b); 1495 } 1496 1497 // For codepoint range contains, containsNone, and containsSome, 1498 // invalid or empty (start > end) ranges have UNDEFINED behavior. 
1499 b = set.contains(c, end); 1500 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1501 "," + end + ") = " + b); 1502 1503 b = set.containsNone(c, end); 1504 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1505 "," + end + ") = " + b); 1506 1507 b = set.containsSome(c, end); 1508 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1509 "," + end + ") = " + b); 1510 1511 int32_t index = set.indexOf(c); 1512 if ((index >= 0) == valid) { 1513 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1514 ") = " + index); 1515 } else { 1516 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1517 ") = " + index); 1518 } 1519 } 1520 } 1521 1522 // Used by TestSymbolTable 1523 class TokenSymbolTable : public SymbolTable { 1524 public: 1525 Hashtable contents; 1526 1527 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1528 contents.setValueDeleter(uhash_deleteUnicodeString); 1529 } 1530 1531 ~TokenSymbolTable() {} 1532 1533 /** 1534 * (Non-SymbolTable API) Add the given variable and value to 1535 * the table. Variable should NOT contain leading '$'. 
1536 */ 1537 void add(const UnicodeString& var, const UnicodeString& value, 1538 UErrorCode& ec) { 1539 if (U_SUCCESS(ec)) { 1540 contents.put(var, new UnicodeString(value), ec); 1541 } 1542 } 1543 1544 /** 1545 * SymbolTable API 1546 */ 1547 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1548 return (const UnicodeString*) contents.get(s); 1549 } 1550 1551 /** 1552 * SymbolTable API 1553 */ 1554 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1555 return NULL; 1556 } 1557 1558 /** 1559 * SymbolTable API 1560 */ 1561 virtual UnicodeString parseReference(const UnicodeString& text, 1562 ParsePosition& pos, int32_t limit) const { 1563 int32_t start = pos.getIndex(); 1564 int32_t i = start; 1565 UnicodeString result; 1566 while (i < limit) { 1567 UChar c = text.charAt(i); 1568 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1569 break; 1570 } 1571 ++i; 1572 } 1573 if (i == start) { // No valid name chars 1574 return result; // Indicate failure with empty string 1575 } 1576 pos.setIndex(i); 1577 text.extractBetween(start, i, result); 1578 return result; 1579 } 1580 }; 1581 1582 void UnicodeSetTest::TestSymbolTable() { 1583 // Multiple test cases can be set up here. Each test case 1584 // is terminated by null: 1585 // var, value, var, value,..., input pat., exp. 
output pat., null 1586 const char* DATA[] = { 1587 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1588 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1589 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1590 NULL 1591 }; 1592 1593 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1594 UErrorCode ec = U_ZERO_ERROR; 1595 TokenSymbolTable sym(ec); 1596 if (U_FAILURE(ec)) { 1597 errln("FAIL: couldn't construct TokenSymbolTable"); 1598 continue; 1599 } 1600 1601 // Set up variables 1602 while (DATA[i+2] != NULL) { 1603 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1604 if (U_FAILURE(ec)) { 1605 errln("FAIL: couldn't add to TokenSymbolTable"); 1606 continue; 1607 } 1608 i += 2; 1609 } 1610 1611 // Input pattern and expected output pattern 1612 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1613 i += 2; 1614 1615 ParsePosition pos(0); 1616 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1617 if (U_FAILURE(ec)) { 1618 errln("FAIL: couldn't construct UnicodeSet"); 1619 continue; 1620 } 1621 1622 // results 1623 if (pos.getIndex() != inpat.length()) { 1624 errln((UnicodeString)"Failed to read to end of string \"" 1625 + inpat + "\": read to " 1626 + pos.getIndex() + ", length is " 1627 + inpat.length()); 1628 } 1629 1630 UnicodeSet us2(exppat, ec); 1631 if (U_FAILURE(ec)) { 1632 errln("FAIL: couldn't construct expected UnicodeSet"); 1633 continue; 1634 } 1635 1636 UnicodeString a, b; 1637 if (us != us2) { 1638 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1639 ", expected " + us2.toPattern(b, TRUE)); 1640 } else { 1641 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1642 } 1643 } 1644 } 1645 1646 void UnicodeSetTest::TestSurrogate() { 1647 const char* DATA[] = { 1648 // These should all behave identically 1649 "[abc\\uD800\\uDC00]", 1650 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1651 "[abc\\U00010000]", 1652 0 1653 }; 1654 for (int 
i=0; DATA[i] != 0; ++i) { 1655 UErrorCode ec = U_ZERO_ERROR; 1656 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1657 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1658 UnicodeSet set(str, ec); 1659 if (U_FAILURE(ec)) { 1660 errln("FAIL: UnicodeSet constructor"); 1661 continue; 1662 } 1663 expectContainment(set, 1664 CharsToUnicodeString("abc\\U00010000"), 1665 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1666 if (set.size() != 4) { 1667 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1668 set.size() + ", expected 4"); 1669 } 1670 } 1671 } 1672 1673 void UnicodeSetTest::TestExhaustive() { 1674 // exhaustive tests. Simulate UnicodeSets with integers. 1675 // That gives us very solid tests (except for large memory tests). 1676 1677 int32_t limit = 128; 1678 1679 UnicodeSet x, y, z, aa; 1680 1681 for (int32_t i = 0; i < limit; ++i) { 1682 bitsToSet(i, x); 1683 logln((UnicodeString)"Testing " + i + ", " + x); 1684 _testComplement(i, x, y); 1685 1686 // AS LONG AS WE ARE HERE, check roundtrip 1687 checkRoundTrip(bitsToSet(i, aa)); 1688 1689 for (int32_t j = 0; j < limit; ++j) { 1690 _testAdd(i,j, x,y,z); 1691 _testXor(i,j, x,y,z); 1692 _testRetain(i,j, x,y,z); 1693 _testRemove(i,j, x,y,z); 1694 } 1695 } 1696 } 1697 1698 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1699 bitsToSet(a, x); 1700 z = x; 1701 z.complement(); 1702 int32_t c = setToBits(z); 1703 if (c != (~a)) { 1704 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1705 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1706 } 1707 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1708 } 1709 1710 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1711 bitsToSet(a, x); 1712 bitsToSet(b, y); 1713 z = x; 1714 z.addAll(y); 1715 int32_t c = setToBits(z); 1716 if (c != (a | b)) { 1717 
errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1718 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1719 } 1720 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1721 } 1722 1723 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1724 bitsToSet(a, x); 1725 bitsToSet(b, y); 1726 z = x; 1727 z.retainAll(y); 1728 int32_t c = setToBits(z); 1729 if (c != (a & b)) { 1730 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1731 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1732 } 1733 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1734 } 1735 1736 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1737 bitsToSet(a, x); 1738 bitsToSet(b, y); 1739 z = x; 1740 z.removeAll(y); 1741 int32_t c = setToBits(z); 1742 if (c != (a &~ b)) { 1743 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1744 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1745 } 1746 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1747 } 1748 1749 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1750 bitsToSet(a, x); 1751 bitsToSet(b, y); 1752 z = x; 1753 z.complementAll(y); 1754 int32_t c = setToBits(z); 1755 if (c != (a ^ b)) { 1756 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1757 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1758 } 1759 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1760 } 1761 1762 /** 1763 * Check that ranges are monotonically increasing and non- 1764 * overlapping. 
1765 */ 1766 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1767 int32_t n = set.getRangeCount(); 1768 if (n < 0) { 1769 errln((UnicodeString)"FAIL result of " + msg + 1770 ": range count should be >= 0 but is " + 1771 n /*+ " for " + set.toPattern())*/); 1772 return; 1773 } 1774 UChar32 last = 0; 1775 for (int32_t i=0; i<n; ++i) { 1776 UChar32 start = set.getRangeStart(i); 1777 UChar32 end = set.getRangeEnd(i); 1778 if (start > end) { 1779 errln((UnicodeString)"FAIL result of " + msg + 1780 ": range " + (i+1) + 1781 " start > end: " + (int)start + ", " + (int)end + 1782 " for " + set); 1783 } 1784 if (i > 0 && start <= last) { 1785 errln((UnicodeString)"FAIL result of " + msg + 1786 ": range " + (i+1) + 1787 " overlaps previous range: " + (int)start + ", " + (int)end + 1788 " for " + set); 1789 } 1790 last = end; 1791 } 1792 } 1793 1794 /** 1795 * Convert a bitmask to a UnicodeSet. 1796 */ 1797 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1798 result.clear(); 1799 for (UChar32 i = 0; i < 32; ++i) { 1800 if ((a & (1<<i)) != 0) { 1801 result.add(i); 1802 } 1803 } 1804 return result; 1805 } 1806 1807 /** 1808 * Convert a UnicodeSet to a bitmask. Only the characters 1809 * U+0000 to U+0020 are represented in the bitmask. 1810 */ 1811 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1812 int32_t result = 0; 1813 for (int32_t i = 0; i < 32; ++i) { 1814 if (x.contains((UChar32)i)) { 1815 result |= (1<<i); 1816 } 1817 } 1818 return result; 1819 } 1820 1821 /** 1822 * Return the representation of an inversion list based UnicodeSet 1823 * as a pairs list. Ranges are listed in ascending Unicode order. 1824 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1825 */ 1826 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1827 UnicodeString pairs; 1828 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1829 UChar32 start = set.getRangeStart(i); 1830 UChar32 end = set.getRangeEnd(i); 1831 if (end > 0xFFFF) { 1832 end = 0xFFFF; 1833 i = set.getRangeCount(); // Should be unnecessary 1834 } 1835 pairs.append((UChar)start).append((UChar)end); 1836 } 1837 return pairs; 1838 } 1839 1840 /** 1841 * Basic consistency check for a few items. 1842 * That the iterator works, and that we can create a pattern and 1843 * get the same thing back 1844 */ 1845 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1846 UErrorCode ec = U_ZERO_ERROR; 1847 1848 UnicodeSet t(s); 1849 checkEqual(s, t, "copy ct"); 1850 1851 t = s; 1852 checkEqual(s, t, "operator="); 1853 1854 copyWithIterator(t, s, FALSE); 1855 checkEqual(s, t, "iterator roundtrip"); 1856 1857 copyWithIterator(t, s, TRUE); // try range 1858 checkEqual(s, t, "iterator roundtrip"); 1859 1860 UnicodeString pat; s.toPattern(pat, FALSE); 1861 t.applyPattern(pat, ec); 1862 if (U_FAILURE(ec)) { 1863 errln("FAIL: applyPattern"); 1864 return; 1865 } else { 1866 checkEqual(s, t, "toPattern(false)"); 1867 } 1868 1869 s.toPattern(pat, TRUE); 1870 t.applyPattern(pat, ec); 1871 if (U_FAILURE(ec)) { 1872 errln("FAIL: applyPattern"); 1873 return; 1874 } else { 1875 checkEqual(s, t, "toPattern(true)"); 1876 } 1877 } 1878 1879 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1880 t.clear(); 1881 UnicodeSetIterator it(s); 1882 if (withRange) { 1883 while (it.nextRange()) { 1884 if (it.isString()) { 1885 t.add(it.getString()); 1886 } else { 1887 t.add(it.getCodepoint(), it.getCodepointEnd()); 1888 } 1889 } 1890 } else { 1891 while (it.next()) { 1892 if (it.isString()) { 1893 t.add(it.getString()); 1894 } else { 1895 t.add(it.getCodepoint()); 1896 } 1897 } 1898 } 1899 } 1900 1901 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const 
UnicodeSet& t, const char* message) { 1902 UnicodeString source; s.toPattern(source, TRUE); 1903 UnicodeString result; t.toPattern(result, TRUE); 1904 if (s != t) { 1905 errln((UnicodeString)"FAIL: " + message 1906 + "; source = " + source 1907 + "; result = " + result 1908 ); 1909 return FALSE; 1910 } else { 1911 logln((UnicodeString)"Ok: " + message 1912 + "; source = " + source 1913 + "; result = " + result 1914 ); 1915 } 1916 return TRUE; 1917 } 1918 1919 void 1920 UnicodeSetTest::expectContainment(const UnicodeString& pat, 1921 const UnicodeString& charsIn, 1922 const UnicodeString& charsOut) { 1923 UErrorCode ec = U_ZERO_ERROR; 1924 UnicodeSet set(pat, ec); 1925 if (U_FAILURE(ec)) { 1926 dataerrln((UnicodeString)"FAIL: pattern \"" + 1927 pat + "\" => " + u_errorName(ec)); 1928 return; 1929 } 1930 expectContainment(set, pat, charsIn, charsOut); 1931 } 1932 1933 void 1934 UnicodeSetTest::expectContainment(const UnicodeSet& set, 1935 const UnicodeString& charsIn, 1936 const UnicodeString& charsOut) { 1937 UnicodeString pat; 1938 set.toPattern(pat); 1939 expectContainment(set, pat, charsIn, charsOut); 1940 } 1941 1942 void 1943 UnicodeSetTest::expectContainment(const UnicodeSet& set, 1944 const UnicodeString& setName, 1945 const UnicodeString& charsIn, 1946 const UnicodeString& charsOut) { 1947 UnicodeString bad; 1948 UChar32 c; 1949 int32_t i; 1950 1951 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 1952 c = charsIn.char32At(i); 1953 if (!set.contains(c)) { 1954 bad.append(c); 1955 } 1956 } 1957 if (bad.length() > 0) { 1958 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 1959 ", expected containment of " + prettify(charsIn)); 1960 } else { 1961 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 1962 } 1963 1964 bad.truncate(0); 1965 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 1966 c = charsOut.char32At(i); 1967 if (set.contains(c)) { 1968 bad.append(c); 1969 } 1970 } 1971 if (bad.length() 
> 0) { 1972 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 1973 ", expected non-containment of " + prettify(charsOut)); 1974 } else { 1975 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 1976 } 1977 } 1978 1979 void 1980 UnicodeSetTest::expectPattern(UnicodeSet& set, 1981 const UnicodeString& pattern, 1982 const UnicodeString& expectedPairs){ 1983 UErrorCode status = U_ZERO_ERROR; 1984 set.applyPattern(pattern, status); 1985 if (U_FAILURE(status)) { 1986 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 1987 "\") failed"); 1988 return; 1989 } else { 1990 if (getPairs(set) != expectedPairs ) { 1991 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 1992 "\") => pairs \"" + 1993 escape(getPairs(set)) + "\", expected \"" + 1994 escape(expectedPairs) + "\""); 1995 } else { 1996 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 1997 "\") => pairs \"" + 1998 escape(getPairs(set)) + "\""); 1999 } 2000 } 2001 // the result of calling set.toPattern(), which is the string representation of 2002 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2003 // will produce another set that is equal to this one. 
2004 UnicodeString temppattern; 2005 set.toPattern(temppattern); 2006 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2007 if (U_FAILURE(status)) { 2008 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2009 return; 2010 } 2011 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2012 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2013 escape(getPairs(set)) + "\"")); 2014 } else{ 2015 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2016 } 2017 2018 delete tempset; 2019 2020 } 2021 2022 void 2023 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2024 if (getPairs(set) != expectedPairs) { 2025 errln(UnicodeString("FAIL: Expected pair list \"") + 2026 escape(expectedPairs) + "\", got \"" + 2027 escape(getPairs(set)) + "\""); 2028 } 2029 } 2030 2031 void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2032 const UnicodeString& expPat, 2033 const char** expStrings) { 2034 UnicodeString pat; 2035 set.toPattern(pat, TRUE); 2036 if (pat == expPat) { 2037 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2038 } else { 2039 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2040 return; 2041 } 2042 if (expStrings == NULL) { 2043 return; 2044 } 2045 UBool in = TRUE; 2046 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2047 if (expStrings[i] == NOT) { // sic; pointer comparison 2048 in = FALSE; 2049 continue; 2050 } 2051 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2052 UBool contained = set.contains(s); 2053 if (contained == in) { 2054 logln((UnicodeString)"Ok: " + expPat + 2055 (contained ? 
" contains {" : " does not contain {") + 2056 escape(expStrings[i]) + "}"); 2057 } else { 2058 errln((UnicodeString)"FAIL: " + expPat + 2059 (contained ? " contains {" : " does not contain {") + 2060 escape(expStrings[i]) + "}"); 2061 } 2062 } 2063 } 2064 2065 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } 2066 2067 void 2068 UnicodeSetTest::doAssert(UBool condition, const char *message) 2069 { 2070 if (!condition) { 2071 errln(UnicodeString("ERROR : ") + message); 2072 } 2073 } 2074 2075 UnicodeString 2076 UnicodeSetTest::escape(const UnicodeString& s) { 2077 UnicodeString buf; 2078 for (int32_t i=0; i<s.length(); ) 2079 { 2080 UChar32 c = s.char32At(i); 2081 if (0x0020 <= c && c <= 0x007F) { 2082 buf += c; 2083 } else { 2084 if (c <= 0xFFFF) { 2085 buf += (UChar)0x5c; buf += (UChar)0x75; 2086 } else { 2087 buf += (UChar)0x5c; buf += (UChar)0x55; 2088 buf += toHexString((c & 0xF0000000) >> 28); 2089 buf += toHexString((c & 0x0F000000) >> 24); 2090 buf += toHexString((c & 0x00F00000) >> 20); 2091 buf += toHexString((c & 0x000F0000) >> 16); 2092 } 2093 buf += toHexString((c & 0xF000) >> 12); 2094 buf += toHexString((c & 0x0F00) >> 8); 2095 buf += toHexString((c & 0x00F0) >> 4); 2096 buf += toHexString(c & 0x000F); 2097 } 2098 i += U16_LENGTH(c); 2099 } 2100 return buf; 2101 } 2102 2103 void UnicodeSetTest::TestFreezable() { 2104 UErrorCode errorCode=U_ZERO_ERROR; 2105 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15); 2106 UnicodeSet idSet(idPattern, errorCode); 2107 if(U_FAILURE(errorCode)) { 2108 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode)); 2109 return; 2110 } 2111 2112 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15); 2113 UnicodeSet wsSet(wsPattern, errorCode); 2114 if(U_FAILURE(errorCode)) { 2115 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode)); 2116 return; 2117 } 2118 2119 idSet.add(idPattern); 2120 
UnicodeSet frozen(idSet); 2121 frozen.freeze(); 2122 2123 if(idSet.isFrozen() || !frozen.isFrozen()) { 2124 errln("FAIL: isFrozen() is wrong"); 2125 } 2126 if(frozen!=idSet || !(frozen==idSet)) { 2127 errln("FAIL: a copy-constructed frozen set differs from its original"); 2128 } 2129 2130 frozen=wsSet; 2131 if(frozen!=idSet || !(frozen==idSet)) { 2132 errln("FAIL: a frozen set was modified by operator="); 2133 } 2134 2135 UnicodeSet frozen2(frozen); 2136 if(frozen2!=frozen || frozen2!=idSet) { 2137 errln("FAIL: a copied frozen set differs from its frozen original"); 2138 } 2139 if(!frozen2.isFrozen()) { 2140 errln("FAIL: copy-constructing a frozen set results in a thawed one"); 2141 } 2142 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction. 2143 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) { 2144 errln("FAIL: UnicodeSet(5, 55) failed"); 2145 } 2146 frozen3=frozen; 2147 if(!frozen3.isFrozen()) { 2148 errln("FAIL: copying a frozen set results in a thawed one"); 2149 } 2150 2151 UnicodeSet *cloned=(UnicodeSet *)frozen.clone(); 2152 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) { 2153 errln("FAIL: clone() failed"); 2154 } 2155 cloned->add(0xd802, 0xd805); 2156 if(cloned->containsSome(0xd802, 0xd805)) { 2157 errln("FAIL: unable to modify clone"); 2158 } 2159 delete cloned; 2160 2161 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed(); 2162 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) { 2163 errln("FAIL: cloneAsThawed() failed"); 2164 } 2165 thawed->add(0xd802, 0xd805); 2166 if(!thawed->contains(0xd802, 0xd805)) { 2167 errln("FAIL: unable to modify thawed clone"); 2168 } 2169 delete thawed; 2170 2171 frozen.set(5, 55); 2172 if(frozen!=idSet || !(frozen==idSet)) { 2173 errln("FAIL: UnicodeSet::set() modified a frozen set"); 2174 } 2175 2176 frozen.clear(); 2177 if(frozen!=idSet || !(frozen==idSet)) { 
2178 errln("FAIL: UnicodeSet::clear() modified a frozen set"); 2179 } 2180 2181 frozen.closeOver(USET_CASE_INSENSITIVE); 2182 if(frozen!=idSet || !(frozen==idSet)) { 2183 errln("FAIL: UnicodeSet::closeOver() modified a frozen set"); 2184 } 2185 2186 frozen.compact(); 2187 if(frozen!=idSet || !(frozen==idSet)) { 2188 errln("FAIL: UnicodeSet::compact() modified a frozen set"); 2189 } 2190 2191 ParsePosition pos; 2192 frozen. 2193 applyPattern(wsPattern, errorCode). 2194 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode). 2195 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode). 2196 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode). 2197 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode); 2198 if(frozen!=idSet || !(frozen==idSet)) { 2199 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set"); 2200 } 2201 2202 frozen. 2203 add(0xd800). 2204 add(0xd802, 0xd805). 2205 add(wsPattern). 2206 addAll(idPattern). 2207 addAll(wsSet); 2208 if(frozen!=idSet || !(frozen==idSet)) { 2209 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set"); 2210 } 2211 2212 frozen. 2213 retain(0x62). 2214 retain(0x64, 0x69). 2215 retainAll(wsPattern). 2216 retainAll(wsSet); 2217 if(frozen!=idSet || !(frozen==idSet)) { 2218 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set"); 2219 } 2220 2221 frozen. 2222 remove(0x62). 2223 remove(0x64, 0x69). 2224 remove(idPattern). 2225 removeAll(idPattern). 2226 removeAll(idSet); 2227 if(frozen!=idSet || !(frozen==idSet)) { 2228 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set"); 2229 } 2230 2231 frozen. 2232 complement(). 2233 complement(0x62). 2234 complement(0x64, 0x69). 2235 complement(idPattern). 2236 complementAll(idPattern). 2237 complementAll(idSet); 2238 if(frozen!=idSet || !(frozen==idSet)) { 2239 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set"); 2240 } 2241 } 2242 2243 // Test span() etc. 
-------------------------------------------------------- *** 2244 2245 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 2246 static int32_t 2247 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) { 2248 UErrorCode errorCode=U_ZERO_ERROR; 2249 int32_t length8=0; 2250 u_strToUTF8(t, capacity, &length8, s, length, &errorCode); 2251 if(U_SUCCESS(errorCode)) { 2252 return length8; 2253 } else { 2254 // The string contains an unpaired surrogate. 2255 // Ignore this string. 2256 return 0; 2257 } 2258 } 2259 2260 class UnicodeSetWithStringsIterator; 2261 2262 // Make the strings in a UnicodeSet easily accessible. 2263 class UnicodeSetWithStrings { 2264 public: 2265 UnicodeSetWithStrings(const UnicodeSet &normalSet) : 2266 set(normalSet), stringsLength(0), hasSurrogates(FALSE) { 2267 int32_t size=set.size(); 2268 if(size>0 && set.charAt(size-1)<0) { 2269 // If a set's last element is not a code point, then it must contain strings. 2270 // Iterate over the set, skip all code point ranges, and cache the strings. 2271 // Convert them to UTF-8 for spanUTF8(). 2272 UnicodeSetIterator iter(set); 2273 const UnicodeString *s; 2274 char *s8=utf8; 2275 int32_t length8, utf8Count=0; 2276 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) { 2277 if(iter.isString()) { 2278 // Store the pointer to the set's string element 2279 // which we happen to know is a stable pointer. 2280 strings[stringsLength]=s=&iter.getString(); 2281 utf8Count+= 2282 utf8Lengths[stringsLength]=length8= 2283 appendUTF8(s->getBuffer(), s->length(), 2284 s8, (int32_t)(sizeof(utf8)-utf8Count)); 2285 if(length8==0) { 2286 hasSurrogates=TRUE; // Contains unpaired surrogates. 
2287 } 2288 s8+=length8; 2289 ++stringsLength; 2290 } 2291 } 2292 } 2293 } 2294 2295 const UnicodeSet &getSet() const { 2296 return set; 2297 } 2298 2299 UBool hasStrings() const { 2300 return (UBool)(stringsLength>0); 2301 } 2302 2303 UBool hasStringsWithSurrogates() const { 2304 return hasSurrogates; 2305 } 2306 2307 private: 2308 friend class UnicodeSetWithStringsIterator; 2309 2310 const UnicodeSet &set; 2311 2312 const UnicodeString *strings[20]; 2313 int32_t stringsLength; 2314 UBool hasSurrogates; 2315 2316 char utf8[1024]; 2317 int32_t utf8Lengths[20]; 2318 2319 int32_t nextStringIndex; 2320 int32_t nextUTF8Start; 2321 }; 2322 2323 class UnicodeSetWithStringsIterator { 2324 public: 2325 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) : 2326 fSet(set), nextStringIndex(0), nextUTF8Start(0) { 2327 } 2328 2329 void reset() { 2330 nextStringIndex=nextUTF8Start=0; 2331 } 2332 2333 const UnicodeString *nextString() { 2334 if(nextStringIndex<fSet.stringsLength) { 2335 return fSet.strings[nextStringIndex++]; 2336 } else { 2337 return NULL; 2338 } 2339 } 2340 2341 // Do not mix with calls to nextString(). 2342 const char *nextUTF8(int32_t &length) { 2343 if(nextStringIndex<fSet.stringsLength) { 2344 const char *s8=fSet.utf8+nextUTF8Start; 2345 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++]; 2346 return s8; 2347 } else { 2348 length=0; 2349 return NULL; 2350 } 2351 } 2352 2353 private: 2354 const UnicodeSetWithStrings &fSet; 2355 int32_t nextStringIndex; 2356 int32_t nextUTF8Start; 2357 }; 2358 2359 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 2360 // at code point boundaries. 2361 // That is, each edge of a match must not be in the middle of a surrogate pair. 
// Returns TRUE if t occurs in s at offset start and neither edge of the
// occurrence splits a surrogate pair. limit is the end offset of s.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    s+=start;
    limit-=start;
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Returns the length of the span at the start of s[0..length).
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: advance while contains(c) agrees with the condition.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first contained code point or the first position where a set string matches.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the furthest limit reached through any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated span and the best recursive span.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16():
// returns the start offset of the span that ends at s[length].
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            // U16_PREV moves length back over the code point that it reads.
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original length because length itself is moved backward.
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start reached through any recursive call.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16().
// A negative result from U8_NEXT is treated as U+FFFD,
// and set strings are compared bytewise with memcmp().
// Strings whose cached UTF-8 length is 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// UTF-8 counterpart of containsSpanBackUTF16().
// A negative result from U8_PREV is treated as U+FFFD.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Each group has one bit per option plus a mask that combines the group's bits.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggle between USET_SPAN_NOT_CONTAINED and the given "contained" condition.
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Length of a NUL-terminated string, read as UChar if isUTF16 and as char otherwise.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
2825 */ 2826 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement, 2827 const void *s, int32_t length, UBool isUTF16, 2828 uint32_t whichSpans, 2829 int type, const char *&typeName, 2830 int32_t limits[], int32_t limitsCapacity, 2831 int32_t expectCount) { 2832 const UnicodeSet &realSet(set.getSet()); 2833 int32_t start, count; 2834 USetSpanCondition spanCondition, firstSpanCondition, contained; 2835 UBool isForward; 2836 2837 if(type<0 || 7<type) { 2838 typeName=""; 2839 return 0; 2840 } 2841 2842 static const char *const typeNames16[]={ 2843 "contains", "contains(LM)", 2844 "span", "span(LM)", 2845 "containsBack", "containsBack(LM)", 2846 "spanBack", "spanBack(LM)" 2847 }; 2848 2849 static const char *const typeNames8[]={ 2850 "containsUTF8", "containsUTF8(LM)", 2851 "spanUTF8", "spanUTF8(LM)", 2852 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented 2853 "spanBackUTF8", "spanBackUTF8(LM)" 2854 }; 2855 2856 typeName= isUTF16 ? typeNames16[type] : typeNames8[type]; 2857 2858 // filter span options 2859 if(type<=3) { 2860 // span forward 2861 if((whichSpans&SPAN_FWD)==0) { 2862 return -1; 2863 } 2864 isForward=TRUE; 2865 } else { 2866 // span backward 2867 if((whichSpans&SPAN_BACK)==0) { 2868 return -1; 2869 } 2870 isForward=FALSE; 2871 } 2872 if((type&1)==0) { 2873 // use USET_SPAN_CONTAINED 2874 if((whichSpans&SPAN_CONTAINED)==0) { 2875 return -1; 2876 } 2877 contained=USET_SPAN_CONTAINED; 2878 } else { 2879 // use USET_SPAN_SIMPLE 2880 if((whichSpans&SPAN_SIMPLE)==0) { 2881 return -1; 2882 } 2883 contained=USET_SPAN_SIMPLE; 2884 } 2885 2886 // Default first span condition for going forward with an uncomplemented set. 2887 spanCondition=USET_SPAN_NOT_CONTAINED; 2888 if(isComplement) { 2889 spanCondition=invertSpanCondition(spanCondition, contained); 2890 } 2891 2892 // First span condition for span(), used to terminate the spanBack() iteration. 
2893 firstSpanCondition=spanCondition; 2894 2895 // spanBack(): Its initial span condition is span()'s last span condition, 2896 // which is the opposite of span()'s first span condition 2897 // if we expect an even number of spans. 2898 // (The loop inverts spanCondition (expectCount-1) times 2899 // before the expectCount'th span() call.) 2900 // If we do not compare forward and backward directions, then we do not have an 2901 // expectCount and just start with firstSpanCondition. 2902 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) { 2903 spanCondition=invertSpanCondition(spanCondition, contained); 2904 } 2905 2906 count=0; 2907 switch(type) { 2908 case 0: 2909 case 1: 2910 start=0; 2911 if(length<0) { 2912 length=slen(s, isUTF16); 2913 } 2914 for(;;) { 2915 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) : 2916 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition); 2917 if(count<limitsCapacity) { 2918 limits[count]=start; 2919 } 2920 ++count; 2921 if(start>=length) { 2922 break; 2923 } 2924 spanCondition=invertSpanCondition(spanCondition, contained); 2925 } 2926 break; 2927 case 2: 2928 case 3: 2929 start=0; 2930 for(;;) { 2931 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) : 2932 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition); 2933 if(count<limitsCapacity) { 2934 limits[count]=start; 2935 } 2936 ++count; 2937 if(length>=0 ? start>=length : 2938 isUTF16 ? ((const UChar *)s)[start]==0 : 2939 ((const char *)s)[start]==0 2940 ) { 2941 break; 2942 } 2943 spanCondition=invertSpanCondition(spanCondition, contained); 2944 } 2945 break; 2946 case 4: 2947 case 5: 2948 if(length<0) { 2949 length=slen(s, isUTF16); 2950 } 2951 for(;;) { 2952 ++count; 2953 if(count<=limitsCapacity) { 2954 limits[limitsCapacity-count]=length; 2955 } 2956 length= isUTF16 ? 
containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) : 2957 containsSpanBackUTF8(set, (const char *)s, length, spanCondition); 2958 if(length==0 && spanCondition==firstSpanCondition) { 2959 break; 2960 } 2961 spanCondition=invertSpanCondition(spanCondition, contained); 2962 } 2963 if(count<limitsCapacity) { 2964 memmove(limits, limits+(limitsCapacity-count), count*4); 2965 } 2966 break; 2967 case 6: 2968 case 7: 2969 for(;;) { 2970 ++count; 2971 if(count<=limitsCapacity) { 2972 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16); 2973 } 2974 // Note: Length<0 is tested only for the first spanBack(). 2975 // If we wanted to keep length<0 for all spanBack()s, we would have to 2976 // temporarily modify the string by placing a NUL where the previous spanBack() stopped. 2977 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) : 2978 realSet.spanBackUTF8((const char *)s, length, spanCondition); 2979 if(length==0 && spanCondition==firstSpanCondition) { 2980 break; 2981 } 2982 spanCondition=invertSpanCondition(spanCondition, contained); 2983 } 2984 if(count<limitsCapacity) { 2985 memmove(limits, limits+(limitsCapacity-count), count*4); 2986 } 2987 break; 2988 default: 2989 typeName=""; 2990 return -1; 2991 } 2992 2993 return count; 2994 } 2995 2996 // sets to be tested; odd index=isComplement 2997 enum { 2998 SLOW, 2999 SLOW_NOT, 3000 FAST, 3001 FAST_NOT, 3002 SET_COUNT 3003 }; 3004 3005 static const char *const setNames[SET_COUNT]={ 3006 "slow", 3007 "slow.not", 3008 "fast", 3009 "fast.not" 3010 }; 3011 3012 /* 3013 * Verify that we get the same results whether we look at text with contains(), 3014 * span() or spanBack(), using unfrozen or frozen versions of the set, 3015 * and using the set or its complement (switching the spanConditions accordingly). 3016 * The latter verifies that 3017 * set.span(spanCondition) == set.complement().span(!spanCondition). 
3018 * 3019 * The expectLimits[] are either provided by the caller (with expectCount>=0) 3020 * or returned to the caller (with an input expectCount<0). 3021 */ 3022 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3023 const void *s, int32_t length, UBool isUTF16, 3024 uint32_t whichSpans, 3025 int32_t expectLimits[], int32_t &expectCount, 3026 const char *testName, int32_t index) { 3027 int32_t limits[500]; 3028 int32_t limitsCount; 3029 int i, j; 3030 3031 const char *typeName; 3032 int type; 3033 3034 for(i=0; i<SET_COUNT; ++i) { 3035 if((i&1)==0) { 3036 // Even-numbered sets are original, uncomplemented sets. 3037 if((whichSpans&SPAN_SET)==0) { 3038 continue; 3039 } 3040 } else { 3041 // Odd-numbered sets are complemented. 3042 if((whichSpans&SPAN_COMPLEMENT)==0) { 3043 continue; 3044 } 3045 } 3046 for(type=0;; ++type) { 3047 limitsCount=getSpans(*sets[i], (UBool)(i&1), 3048 s, length, isUTF16, 3049 whichSpans, 3050 type, typeName, 3051 limits, LENGTHOF(limits), expectCount); 3052 if(typeName[0]==0) { 3053 break; // All types tried. 3054 } 3055 if(limitsCount<0) { 3056 continue; // Span option filtered out. 
3057 } 3058 if(expectCount<0) { 3059 expectCount=limitsCount; 3060 if(limitsCount>LENGTHOF(limits)) { 3061 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans", 3062 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits)); 3063 return; 3064 } 3065 memcpy(expectLimits, limits, limitsCount*4); 3066 } else if(limitsCount!=expectCount) { 3067 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld", 3068 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount); 3069 } else { 3070 for(j=0; j<limitsCount; ++j) { 3071 if(limits[j]!=expectLimits[j]) { 3072 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld", 3073 testName, (long)index, setNames[i], typeName, (long)limitsCount, 3074 j, (long)limits[j], (long)expectLimits[j]); 3075 break; 3076 } 3077 } 3078 } 3079 } 3080 } 3081 3082 // Compare span() with containsAll()/containsNone(), 3083 // but only if we have expectLimits[] from the uncomplemented set. 
3084 if(isUTF16 && (whichSpans&SPAN_SET)!=0) { 3085 const UChar *s16=(const UChar *)s; 3086 UnicodeString string; 3087 int32_t prev=0, limit, length; 3088 for(i=0; i<expectCount; ++i) { 3089 limit=expectLimits[i]; 3090 length=limit-prev; 3091 if(length>0) { 3092 string.setTo(FALSE, s16+prev, length); // read-only alias 3093 if(i&1) { 3094 if(!sets[SLOW]->getSet().containsAll(string)) { 3095 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3096 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3097 return; 3098 } 3099 if(!sets[FAST]->getSet().containsAll(string)) { 3100 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3101 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3102 return; 3103 } 3104 } else { 3105 if(!sets[SLOW]->getSet().containsNone(string)) { 3106 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3107 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3108 return; 3109 } 3110 if(!sets[FAST]->getSet().containsNone(string)) { 3111 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3112 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3113 return; 3114 } 3115 } 3116 } 3117 prev=limit; 3118 } 3119 } 3120 } 3121 3122 // Specifically test either UTF-16 or UTF-8. 
3123 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3124 const void *s, int32_t length, UBool isUTF16, 3125 uint32_t whichSpans, 3126 const char *testName, int32_t index) { 3127 int32_t expectLimits[500]; 3128 int32_t expectCount=-1; 3129 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3130 } 3131 3132 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3133 UChar c, c2; 3134 3135 if(length>=0) { 3136 while(length>0) { 3137 c=*s++; 3138 --length; 3139 if(0xd800<=c && c<0xe000) { 3140 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3141 return TRUE; 3142 } 3143 --length; 3144 } 3145 } 3146 } else { 3147 while((c=*s++)!=0) { 3148 if(0xd800<=c && c<0xe000) { 3149 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3150 return TRUE; 3151 } 3152 } 3153 } 3154 } 3155 return FALSE; 3156 } 3157 3158 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3159 // unless either UTF is turned off in whichSpans. 3160 // Testing UTF-16 and UTF-8 together requires that surrogate code points 3161 // have the same contains(c) value as U+FFFD. 3162 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3163 const UChar *s16, int32_t length16, 3164 uint32_t whichSpans, 3165 const char *testName, int32_t index) { 3166 int32_t expectLimits[500]; 3167 int32_t expectCount; 3168 3169 expectCount=-1; // Get expectLimits[] from testSpan(). 3170 3171 if((whichSpans&SPAN_UTF16)!=0) { 3172 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3173 } 3174 if((whichSpans&SPAN_UTF8)==0) { 3175 return; 3176 } 3177 3178 // Convert s16[] and expectLimits[] to UTF-8. 
3179 uint8_t s8[3000]; 3180 int32_t offsets[3000]; 3181 3182 const UChar *s16Limit=s16+length16; 3183 char *t=(char *)s8; 3184 char *tLimit=t+sizeof(s8); 3185 int32_t *o=offsets; 3186 UErrorCode errorCode=U_ZERO_ERROR; 3187 3188 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3189 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3190 if(U_FAILURE(errorCode)) { 3191 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3192 testName, (long)index, u_errorName(errorCode)); 3193 ucnv_resetFromUnicode(utf8Cnv); 3194 return; 3195 } 3196 int32_t length8=(int32_t)(t-(char *)s8); 3197 3198 // Convert expectLimits[]. 3199 int32_t i, j, expect; 3200 for(i=j=0; i<expectCount; ++i) { 3201 expect=expectLimits[i]; 3202 if(expect==length16) { 3203 expectLimits[i]=length8; 3204 } else { 3205 while(offsets[j]<expect) { 3206 ++j; 3207 } 3208 expectLimits[i]=j; 3209 } 3210 } 3211 3212 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3213 } 3214 3215 static UChar32 nextCodePoint(UChar32 c) { 3216 // Skip some large and boring ranges. 3217 switch(c) { 3218 case 0x3441: 3219 return 0x4d7f; 3220 case 0x5100: 3221 return 0x9f00; 3222 case 0xb040: 3223 return 0xd780; 3224 case 0xe041: 3225 return 0xf8fe; 3226 case 0x10100: 3227 return 0x20000; 3228 case 0x20041: 3229 return 0xe0000; 3230 case 0xe0101: 3231 return 0x10fffd; 3232 default: 3233 return c+1; 3234 } 3235 } 3236 3237 // Verify that all implementations represent the same set. 3238 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3239 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3240 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3241 // Skip the UTF-8 part of the test - if the string contains surrogates - 3242 // because it is likely to produce a different result. 
3243 UBool inconsistentSurrogates= 3244 (!(sets[0]->getSet().contains(0xfffd) ? 3245 sets[0]->getSet().contains(0xd800, 0xdfff) : 3246 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3247 sets[0]->hasStringsWithSurrogates()); 3248 3249 UChar s[1000]; 3250 int32_t length=0; 3251 uint32_t localWhichSpans; 3252 3253 UChar32 c, first; 3254 for(first=c=0;; c=nextCodePoint(c)) { 3255 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) { 3256 localWhichSpans=whichSpans; 3257 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3258 localWhichSpans&=~SPAN_UTF8; 3259 } 3260 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3261 if(c>0x10ffff) { 3262 break; 3263 } 3264 length=0; 3265 first=c; 3266 } 3267 U16_APPEND_UNSAFE(s, length, c); 3268 } 3269 } 3270 3271 // Test with a particular, interesting string. 3272 // Specify length and try NUL-termination. 3273 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3274 static const UChar s[]={ 3275 0x61, 0x62, 0x20, // Latin, space 3276 0x3b1, 0x3b2, 0x3b3, // Greek 3277 0xd900, // lead surrogate 3278 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3279 0xdc05, // trail surrogate 3280 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3281 0xd900, 0xdc05, // unassigned supplementary 3282 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3283 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3284 0 // NUL 3285 }; 3286 3287 if((whichSpans&SPAN_UTF16)==0) { 3288 return; 3289 } 3290 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3291 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3292 } 3293 3294 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3295 static const char s[]={ 3296 "abc" // Latin 3297 3298 /* trail byte in lead position */ 3299 "\x80" 3300 3301 " " // space 3302 
3303 /* truncated multi-byte sequences */ 3304 "\xd0" 3305 "\xe0" 3306 "\xe1" 3307 "\xed" 3308 "\xee" 3309 "\xf0" 3310 "\xf1" 3311 "\xf4" 3312 "\xf8" 3313 "\xfc" 3314 3315 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3316 3317 /* trail byte in lead position */ 3318 "\x80" 3319 3320 "\xe0\x80" 3321 "\xe0\xa0" 3322 "\xe1\x80" 3323 "\xed\x80" 3324 "\xed\xa0" 3325 "\xee\x80" 3326 "\xf0\x80" 3327 "\xf0\x90" 3328 "\xf1\x80" 3329 "\xf4\x80" 3330 "\xf4\x90" 3331 "\xf8\x80" 3332 "\xfc\x80" 3333 3334 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3335 3336 /* trail byte in lead position */ 3337 "\x80" 3338 3339 "\xf0\x80\x80" 3340 "\xf0\x90\x80" 3341 "\xf1\x80\x80" 3342 "\xf4\x80\x80" 3343 "\xf4\x90\x80" 3344 "\xf8\x80\x80" 3345 "\xfc\x80\x80" 3346 3347 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3348 3349 /* trail byte in lead position */ 3350 "\x80" 3351 3352 "\xf8\x80\x80\x80" 3353 "\xfc\x80\x80\x80" 3354 3355 "\xF1\x90\x80\x85" // unassigned supplementary 3356 3357 /* trail byte in lead position */ 3358 "\x80" 3359 3360 "\xfc\x80\x80\x80\x80" 3361 3362 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3363 3364 /* trail byte in lead position */ 3365 "\x80" 3366 3367 /* complete sequences but non-shortest forms or out of range etc. */ 3368 "\xc0\x80" 3369 "\xe0\x80\x80" 3370 "\xed\xa0\x80" 3371 "\xf0\x80\x80\x80" 3372 "\xf4\x90\x80\x80" 3373 "\xf8\x80\x80\x80\x80" 3374 "\xfc\x80\x80\x80\x80\x80" 3375 "\xfe" 3376 "\xff" 3377 3378 /* trail byte in lead position */ 3379 "\x80" 3380 3381 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3382 }; 3383 3384 if((whichSpans&SPAN_UTF8)==0) { 3385 return; 3386 } 3387 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3388 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3389 } 3390 3391 // Take a set of span options and multiply them so that 3392 // each portion only has one of the options a, b and c. 
3393 // If b==0, then the set of options is just modified with mask and a. 3394 // If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3395 static int32_t 3396 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3397 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3398 uint32_t s; 3399 int32_t i; 3400 3401 for(i=0; i<whichSpansCount; ++i) { 3402 s=whichSpans[i]&mask; 3403 whichSpans[i]=s|a; 3404 if(b!=0) { 3405 whichSpans[whichSpansCount+i]=s|b; 3406 if(c!=0) { 3407 whichSpans[2*whichSpansCount+i]=s|c; 3408 } 3409 } 3410 } 3411 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3412 } 3413 3414 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3415 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3416 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3417 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3418 3419 void UnicodeSetTest::TestSpan() { 3420 // "[...]" is a UnicodeSet pattern. 3421 // "*" performs tests on all Unicode code points and on a selection of 3422 // malformed UTF-8/16 strings. 3423 // "-options" limits the scope of testing for the current set. 3424 // By default, the test verifies that equivalent boundaries are found 3425 // for UTF-16 and UTF-8, going forward and backward, 3426 // alternating USET_SPAN_NOT_CONTAINED with 3427 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3428 // Single-character options: 3429 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3430 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3431 // or the set contains strings with unpaired surrogates 3432 // which do not translate to valid UTF-8. 3433 // c -- set.span() and set.complement().span() boundaries may differ. 3434 // Cause: Set strings are not complemented. 3435 // b -- span() and spanBack() boundaries may differ. 
3436 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3437 // and spanBack(USET_SPAN_SIMPLE) are defined to 3438 // match with non-overlapping substrings. 3439 // For example, with a set containing "ab" and "ba", 3440 // span() of "aba" yields boundaries { 0, 2, 3 } 3441 // because the initial "ab" matches from 0 to 2, 3442 // while spanBack() yields boundaries { 0, 1, 3 } 3443 // because the final "ba" matches from 1 to 3. 3444 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3445 // Cause: Strings in the set overlap, and a longer match may 3446 // require a sequence including non-longest substrings. 3447 // For example, with a set containing "ab", "abc" and "cd", 3448 // span(contained) of "abcd" spans the entire string 3449 // but span(longest match) only spans the first 3 characters. 3450 // Each "-options" first resets all options and then applies the specified options. 3451 // A "-" without options resets the options. 3452 // The options are also reset for each new set. 3453 // Other strings will be spanned. 3454 static const char *const testdata[]={ 3455 "[:ID_Continue:]", 3456 "*", 3457 "[:White_Space:]", 3458 "*", 3459 "[]", 3460 "*", 3461 "[\\u0000-\\U0010FFFF]", 3462 "*", 3463 "[\\u0000\\u0080\\u0800\\U00010000]", 3464 "*", 3465 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3466 "*", 3467 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3468 "-c", 3469 "*", 3470 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3471 "-c", 3472 "*", 3473 3474 // Overlapping strings cause overlapping attempts to match. 3475 "[x{xy}{xya}{axy}{ax}]", 3476 "-cl", 3477 3478 // More repetitions of "xya" would take too long with the recursive 3479 // reference implementation. 3480 // containsAll()=FALSE 3481 // test_string 0x14 3482 "xx" 3483 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3484 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3485 "xyaxyaxyaxya" 3486 "xx" 3487 "xyaxyaxyaxya" // span() ends here. 3488 "aaa", 3489 3490 // containsAll()=TRUE 3491 // test_string 0x15 3492 "xx" 3493 "xyaxyaxyaxya" 3494 "xx" 3495 "xyaxyaxyaxya" 3496 "xx" 3497 "xyaxyaxyaxy", 3498 3499 "-bc", 3500 // test_string 0x17 3501 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3502 "-c", 3503 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3504 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3505 "-", 3506 "byaya", // span() -> { 5 } 3507 "byay", // span() -> { 4 } 3508 "bya", // span() -> { 3 } 3509 3510 // span(longest match) will not span the whole string. 3511 "[a{ab}{bc}]", 3512 "-cl", 3513 // test_string 0x21 3514 "abc", 3515 3516 "[a{ab}{abc}{cd}]", 3517 "-cl", 3518 "acdabcdabccd", 3519 3520 // spanBack(longest match) will not span the whole string. 3521 "[c{ab}{bc}]", 3522 "-cl", 3523 "abc", 3524 3525 "[d{cd}{bcd}{ab}]", 3526 "-cl", 3527 "abbcdabcdabd", 3528 3529 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3530 // and UTF-8 trail bytes. 3531 // Copies of above test sets and strings, but transliterated to have 3532 // different code points with similar trail units. 3533 // Previous: a b c d 3534 // Unicode: 042B 30AB 200AB 204AB 3535 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3536 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3537 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3538 "-cl", 3539 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3540 3541 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3542 "-cl", 3543 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3544 3545 // Stress bookkeeping and recursion. 
3546 // The following strings are barely doable with the recursive 3547 // reference implementation. 3548 // The not-contained character at the end prevents an early exit from the span(). 3549 "[b{bb}]", 3550 "-c", 3551 // test_string 0x33 3552 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3553 // On complement sets, span() and spanBack() get different results 3554 // because b is not in the complement set and there is an odd number of b's 3555 // in the test string. 3556 "-bc", 3557 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3558 3559 // Test with set strings with an initial or final code point span 3560 // longer than 254. 3561 "[a{" _64_a _64_a _64_a _64_a "b}" 3562 "{a" _64_b _64_b _64_b _64_b "}]", 3563 "-c", 3564 _64_a _64_a _64_a _63_a "b", 3565 _64_a _64_a _64_a _64_a "b", 3566 _64_a _64_a _64_a _64_a "aaaabbbb", 3567 "a" _64_b _64_b _64_b _63_b, 3568 "a" _64_b _64_b _64_b _64_b, 3569 "aaaabbbb" _64_b _64_b _64_b _64_b, 3570 3571 // Test with strings containing unpaired surrogates. 3572 // They are not representable in UTF-8, and a leading trail surrogate 3573 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3574 // U+20001 == \\uD840\\uDC01 3575 // U+20400 == \\uD841\\uDC00 3576 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3577 "-8cl", 3578 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3579 }; 3580 uint32_t whichSpans[96]={ SPAN_ALL }; 3581 int32_t whichSpansCount=1; 3582 3583 UnicodeSet *sets[SET_COUNT]={ NULL }; 3584 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3585 3586 char testName[1024]; 3587 char *testNameLimit=testName; 3588 3589 int32_t i, j; 3590 for(i=0; i<LENGTHOF(testdata); ++i) { 3591 const char *s=testdata[i]; 3592 if(s[0]=='[') { 3593 // Create new test sets from this pattern. 
3594 for(j=0; j<SET_COUNT; ++j) { 3595 delete sets_with_str[j]; 3596 delete sets[j]; 3597 } 3598 UErrorCode errorCode=U_ZERO_ERROR; 3599 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3600 if(U_FAILURE(errorCode)) { 3601 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3602 break; 3603 } 3604 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3605 sets[SLOW_NOT]->complement(); 3606 // Intermediate set: Test cloning of a frozen set. 3607 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3608 fast->freeze(); 3609 sets[FAST]=(UnicodeSet *)fast->clone(); 3610 delete fast; 3611 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3612 fastNot->freeze(); 3613 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3614 delete fastNot; 3615 3616 for(j=0; j<SET_COUNT; ++j) { 3617 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3618 } 3619 3620 strcpy(testName, s); 3621 testNameLimit=strchr(testName, 0); 3622 *testNameLimit++=':'; 3623 *testNameLimit=0; 3624 3625 whichSpans[0]=SPAN_ALL; 3626 whichSpansCount=1; 3627 } else if(s[0]=='-') { 3628 whichSpans[0]=SPAN_ALL; 3629 whichSpansCount=1; 3630 3631 while(*++s!=0) { 3632 switch(*s) { 3633 case 'c': 3634 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3635 ~SPAN_POLARITY, 3636 SPAN_SET, 3637 SPAN_COMPLEMENT, 3638 0); 3639 break; 3640 case 'b': 3641 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3642 ~SPAN_DIRS, 3643 SPAN_FWD, 3644 SPAN_BACK, 3645 0); 3646 break; 3647 case 'l': 3648 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3649 // USET_SPAN_SIMPLE only FWD, and separately 3650 // USET_SPAN_SIMPLE only BACK 3651 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3652 ~(SPAN_DIRS|SPAN_CONDITION), 3653 SPAN_DIRS|SPAN_CONTAINED, 3654 SPAN_FWD|SPAN_SIMPLE, 3655 SPAN_BACK|SPAN_SIMPLE); 3656 break; 3657 case '8': 3658 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3659 ~SPAN_UTFS, 3660 SPAN_UTF16, 3661 SPAN_UTF8, 3662 
0); 3663 break; 3664 default: 3665 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3666 break; 3667 } 3668 } 3669 } else if(0==strcmp(s, "*")) { 3670 strcpy(testNameLimit, "bad_string"); 3671 for(j=0; j<whichSpansCount; ++j) { 3672 if(whichSpansCount>1) { 3673 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3674 "%%0x%3x", 3675 whichSpans[j]); 3676 } 3677 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3678 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3679 } 3680 3681 strcpy(testNameLimit, "contents"); 3682 for(j=0; j<whichSpansCount; ++j) { 3683 if(whichSpansCount>1) { 3684 sprintf(testNameLimit+8 /* strlen("contents") */, 3685 "%%0x%3x", 3686 whichSpans[j]); 3687 } 3688 testSpanContents(sets_with_str, whichSpans[j], testName); 3689 } 3690 } else { 3691 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3692 strcpy(testNameLimit, "test_string"); 3693 for(j=0; j<whichSpansCount; ++j) { 3694 if(whichSpansCount>1) { 3695 sprintf(testNameLimit+11 /* strlen("test_string") */, 3696 "%%0x%3x", 3697 whichSpans[j]); 3698 } 3699 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3700 } 3701 } 3702 } 3703 for(j=0; j<SET_COUNT; ++j) { 3704 delete sets_with_str[j]; 3705 delete sets[j]; 3706 } 3707 } 3708 3709 // Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3710 void UnicodeSetTest::TestStringSpan() { 3711 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3712 static const char *const string= 3713 "xx" 3714 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3715 "xx" 3716 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3717 "xx" 3718 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3719 "aaaa"; 3720 3721 UErrorCode errorCode=U_ZERO_ERROR; 3722 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3723 UnicodeSet set(pattern16, errorCode); 3724 if(U_FAILURE(errorCode)) { 3725 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3726 return; 3727 } 3728 3729 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3730 3731 if(set.containsAll(string16)) { 3732 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3733 } 3734 3735 // Remove trailing "aaaa". 3736 string16.truncate(string16.length()-4); 3737 if(!set.containsAll(string16)) { 3738 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3739 } 3740 3741 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3742 const UChar *s16=string16.getBuffer(); 3743 int32_t length16=string16.length(); 3744 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3745 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3746 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3747 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3748 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3749 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3750 ) { 3751 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3752 } 3753 3754 pattern="[a{ab}{abc}{cd}]"; 3755 pattern16=UnicodeString(pattern, -1, US_INV); 3756 set.applyPattern(pattern16, errorCode); 3757 if(U_FAILURE(errorCode)) { 3758 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3759 return; 3760 } 3761 
string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3762 s16=string16.getBuffer(); 3763 length16=string16.length(); 3764 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3765 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3766 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3767 ) { 3768 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3769 } 3770 3771 pattern="[d{cd}{bcd}{ab}]"; 3772 pattern16=UnicodeString(pattern, -1, US_INV); 3773 set.applyPattern(pattern16, errorCode).freeze(); 3774 if(U_FAILURE(errorCode)) { 3775 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3776 return; 3777 } 3778 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3779 s16=string16.getBuffer(); 3780 length16=string16.length(); 3781 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3782 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3783 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3784 ) { 3785 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3786 } 3787 } 3788