/*
********************************************************************************
*   Copyright (C) 1999-2010 International Business Machines Corporation and
*   others. All Rights Reserved.
********************************************************************************
*   Date        Name        Description
*   10/20/99    alan        Creation.
*   03/22/2000  Madhu       Added additional tests
********************************************************************************
*/

#include <stdio.h>

#include <string.h>
#include "unicode/utypes.h"
#include "usettest.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "unicode/parsepos.h"
#include "unicode/symtable.h"
#include "unicode/uversion.h"
#include "hash.h"

// Number of elements in a statically sized array, as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Reports (via dataerrln) the file, line and error name when status is a failure code.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
    u_errorName(status));}}

// Reports (via dataerrln) the file and line when expr evaluates to false.
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}

/**
 * Convenience for building log messages: appends the escaped pattern of
 * `set` (as produced by toPattern()) to `left`.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pat;
    set.toPattern(pat);
    return left + UnicodeSetTest::escape(pat);
}

// One entry of the runIndexedTest() dispatch switch: always publishes the
// test name, and runs the test only when `exec` is set.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln();                  \
                              test();                   \
                          }                             \
                          break

// The UTF-8 converter is opened lazily by openUTF8Converter().
UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
}

/**
 * Returns the cached UTF-8 converter, opening it on first use.
 * The error code from ucnv_open() is not inspected here; callers get
 * whatever ucnv_open() returned.
 */
UConverter *UnicodeSetTest::openUTF8Converter() {
    if(utf8Cnv==NULL) {
        UErrorCode errorCode=U_ZERO_ERROR;
        utf8Cnv=ucnv_open("UTF-8", &errorCode);
    }
    return utf8Cnv;
}

// Closes the converter opened by openUTF8Converter(), if any.
UnicodeSetTest::~UnicodeSetTest() {
    ucnv_close(utf8Cnv);
}

/**
 * Test dispatcher: maps `index` to a test method, stores the test's name in
 * `name`, and runs it when `exec` is true. An unknown index yields an
 * empty name.
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        default: name = ""; break;
    }
}

// Marker used inside the expected-string lists passed to expectToPattern():
// it separates the leading entries from the trailing ones (e.g. "aa", "ab",
// NOT, "ac").
// NOTE(review): presumably entries after NOT must be absent from the set —
// confirm against expectToPattern().
static const char NOT[] = "%%%%";

/**
 * UVector was improperly copying contents
 * This code will crash if this is still true
 */
void UnicodeSetTest::Testj2268() {
  UnicodeSet t;
  t.add(UnicodeString("abc"));
  UnicodeSet test(t);
  UnicodeString ustrPat;
  test.toPattern(ustrPat, TRUE);
}

/**
 * Test toPattern().
 */
void UnicodeSetTest::TestToPattern() {
    UErrorCode ec = U_ZERO_ERROR;

    // Test that toPattern() round trips with syntax characters and
    // whitespace.
    {
        static const char* OTHER_TOPATTERN_TESTS[] = {
            "[[:latin:]&[:greek:]]",
            "[[:latin:]-[:greek:]]",
            "[:nonspacing mark:]",
            NULL
        };

        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
            ec = U_ZERO_ERROR;
            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
            if (U_FAILURE(ec)) {
                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
                continue;
            }
            checkPat(OTHER_TOPATTERN_TESTS[j], s);
        }

        // Only non-alphabetic Latin-1 code points and whitespace are exercised.
        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {

                // check various combinations to make sure they all work.
                if (i != 0 && !toPatternAux(i, i)){
                    continue;
                }
                if (!toPatternAux(0, i)){
                    continue;
                }
                if (!toPatternAux(i, 0xFFFF)){
                    continue;
                }
            }
        }
    }

    // Test pattern behavior of multicharacter strings.
    {
        ec = U_ZERO_ERROR;
        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);

        // This loop isn't a loop. It's here to make the compiler happy.
        // If you're curious, try removing it and changing the 'break'
        // statements (except for the last) to goto's.
        for (;;) {
            if (U_FAILURE(ec)) break;
            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);

            s->add("ac");
            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);

            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
            if (U_FAILURE(ec)) break;
            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);

            s->add("[]");
            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);

            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
            if (U_FAILURE(ec)) break;
            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);

            // j2189
            s->clear();
            s->add(UnicodeString("abc", ""));
            s->add(UnicodeString("abc", ""));
            const char* exp6[] = {"abc", NOT, "ab", NULL};
            expectToPattern(*s, "[{abc}]", exp6);

            break;
        }

        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
        delete s;
    }

    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    UnicodeSet s;
    s.add((UChar)97, (UChar)98); // 'a', 'b'
    expectToPattern(s, "[ab]", NULL);
}

UBool
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 208 209 // use Integer.toString because Utility.hex doesn't handle ints 210 UnicodeString pat = ""; 211 // TODO do these in hex 212 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 214 UnicodeString source; 215 source = source + (uint32_t)start; 216 if (start != end) 217 source = source + ".." + (uint32_t)end; 218 UnicodeSet testSet; 219 testSet.add(start, end); 220 return checkPat(source, testSet); 221 } 222 223 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 224 const UnicodeSet& testSet) { 225 // What we want to make sure of is that a pattern generated 226 // by toPattern(), with or without escaped unprintables, can 227 // be passed back into the UnicodeSet constructor. 228 UnicodeString pat0; 229 230 testSet.toPattern(pat0, TRUE); 231 232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 233 234 //String pat1 = unescapeLeniently(pat0); 235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 236 237 UnicodeString pat2; 238 testSet.toPattern(pat2, FALSE); 239 if (!checkPat(source, testSet, pat2)) return FALSE; 240 241 //String pat3 = unescapeLeniently(pat2); 242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 243 244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 246 return TRUE; 247 } 248 249 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 250 const UnicodeSet& testSet, 251 const UnicodeString& pat) { 252 UErrorCode ec = U_ZERO_ERROR; 253 UnicodeSet testSet2(pat, ec); 254 if (testSet2 != testSet) { 255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 256 return FALSE; 257 } 258 return TRUE; 259 } 260 261 void 262 UnicodeSetTest::TestPatterns(void) { 263 UnicodeSet set; 264 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 270 271 // Throw in a test of complement 272 set.complement(); 273 UnicodeString exp; 274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 275 expectPairs(set, exp); 276 } 277 278 void 279 UnicodeSetTest::TestCategories(void) { 280 UErrorCode status = U_ZERO_ERROR; 281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 282 UnicodeSet set(pat, status); 283 if (U_FAILURE(status)) { 284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 285 return; 286 } else { 287 expectContainment(set, pat, "ABC", "abc"); 288 } 289 290 UChar32 i; 291 int32_t failures = 0; 292 // Make sure generation of L doesn't pollute cached Lu set 293 // First generate L, then Lu 294 set.applyPattern("[:L:]", status); 295 if (U_FAILURE(status)) { errln("FAIL"); return; } 296 for (i=0; i<0x200; ++i) { 297 UBool l = u_isalpha((UChar)i); 298 if (l != set.contains(i)) { 299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 300 set.contains(i)); 301 if (++failures == 10) break; 302 } 303 } 304 305 set.applyPattern("[:Lu:]", status); 306 if (U_FAILURE(status)) { errln("FAIL"); return; } 307 for (i=0; i<0x200; ++i) { 308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 309 if (lu != set.contains(i)) { 310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 311 set.contains(i)); 312 if (++failures == 20) break; 313 } 314 } 315 } 316 void 317 UnicodeSetTest::TestCloneEqualHash(void) { 318 UErrorCode status = U_ZERO_ERROR; 319 // set1 and set2 used to be built 
with the obsolete constructor taking 320 // UCharCategory values; replaced with pattern constructors 321 // markus 20030502 322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 324 if (U_FAILURE(status)){ 325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 326 return; 327 } 328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 330 if (U_FAILURE(status)){ 331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 332 return; 333 } 334 335 if (*set1 != *set1a) { 336 errln("FAIL: category constructor for Ll broken"); 337 } 338 if (*set2 != *set2a) { 339 errln("FAIL: category constructor for Nd broken"); 340 } 341 delete set1a; 342 delete set2a; 343 344 logln("Testing copy construction"); 345 UnicodeSet *set1copy=new UnicodeSet(*set1); 346 if(*set1 != *set1copy || *set1 == *set2 || 347 getPairs(*set1) != getPairs(*set1copy) || 348 set1->hashCode() != set1copy->hashCode()){ 349 errln("FAIL : Error in copy construction"); 350 return; 351 } 352 353 logln("Testing =operator"); 354 UnicodeSet set1equal=*set1; 355 UnicodeSet set2equal=*set2; 356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 358 errln("FAIL: Error in =operator"); 359 } 360 361 logln("Testing clone()"); 362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 366 *set2clone == *set1 || 
*set2clone == set1equal || *set2clone == *set1clone){ 367 errln("FAIL: Error in clone"); 368 } 369 370 logln("Testing hashcode"); 371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 376 errln("FAIL: Error in hashCode()"); 377 } 378 379 delete set1; 380 delete set1copy; 381 delete set2; 382 delete set1clone; 383 delete set2clone; 384 385 386 } 387 void 388 UnicodeSetTest::TestAddRemove(void) { 389 UnicodeSet set; // Construct empty set 390 doAssert(set.isEmpty() == TRUE, "set should be empty"); 391 doAssert(set.size() == 0, "size should be 0"); 392 set.complement(); 393 doAssert(set.size() == 0x110000, "size should be 0x110000"); 394 set.clear(); 395 set.add(0x0061, 0x007a); 396 expectPairs(set, "az"); 397 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 398 doAssert(set.size() != 0, "size should not be equal to 0"); 399 doAssert(set.size() == 26, "size should be equal to 26"); 400 set.remove(0x006d, 0x0070); 401 expectPairs(set, "alqz"); 402 doAssert(set.size() == 22, "size should be equal to 22"); 403 set.remove(0x0065, 0x0067); 404 expectPairs(set, "adhlqz"); 405 doAssert(set.size() == 19, "size should be equal to 19"); 406 set.remove(0x0064, 0x0069); 407 expectPairs(set, "acjlqz"); 408 doAssert(set.size() == 16, "size should be equal to 16"); 409 set.remove(0x0063, 0x0072); 410 expectPairs(set, "absz"); 411 doAssert(set.size() == 10, "size should be equal to 10"); 412 set.add(0x0066, 0x0071); 413 expectPairs(set, "abfqsz"); 414 doAssert(set.size() == 22, "size should be equal to 22"); 415 set.remove(0x0061, 0x0067); 416 expectPairs(set, 
"hqsz"); 417 set.remove(0x0061, 0x007a); 418 expectPairs(set, ""); 419 doAssert(set.isEmpty() == TRUE, "set should be empty"); 420 doAssert(set.size() == 0, "size should be 0"); 421 set.add(0x0061); 422 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 423 doAssert(set.size() == 1, "size should not be equal to 1"); 424 set.add(0x0062); 425 set.add(0x0063); 426 expectPairs(set, "ac"); 427 doAssert(set.size() == 3, "size should not be equal to 3"); 428 set.add(0x0070); 429 set.add(0x0071); 430 expectPairs(set, "acpq"); 431 doAssert(set.size() == 5, "size should not be equal to 5"); 432 set.clear(); 433 expectPairs(set, ""); 434 doAssert(set.isEmpty() == TRUE, "set should be empty"); 435 doAssert(set.size() == 0, "size should be 0"); 436 437 // Try removing an entire set from another set 438 expectPattern(set, "[c-x]", "cx"); 439 UnicodeSet set2; 440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 441 set.removeAll(set2); 442 expectPairs(set, "deluxx"); 443 444 // Try adding an entire set to another set 445 expectPattern(set, "[jackiemclean]", "aacceein"); 446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 447 set.addAll(set2); 448 expectPairs(set, "aacehort"); 449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 450 451 // Try retaining an set of elements contained in another set (intersection) 452 UnicodeSet set3; 453 expectPattern(set3, "[a-c]", "ac"); 454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 455 set3.remove(0x0062); 456 expectPairs(set3, "aacc"); 457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 458 set.retainAll(set3); 459 expectPairs(set, "aacc"); 460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 462 set.clear(); 463 doAssert(set.size() != set3.size(), "set.size() != 
set3.size()"); 464 465 // Test commutativity 466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 467 expectPattern(set2, "[jackiemclean]", "aacceein"); 468 set.addAll(set2); 469 expectPairs(set, "aacehort"); 470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 471 472 473 474 475 } 476 477 /** 478 * Make sure minimal representation is maintained. 479 */ 480 void UnicodeSetTest::TestMinimalRep() { 481 UErrorCode status = U_ZERO_ERROR; 482 // This is pretty thoroughly tested by checkCanonicalRep() 483 // run against the exhaustive operation results. Use the code 484 // here for debugging specific spot problems. 485 486 // 1 overlap against 2 487 UnicodeSet set("[h-km-q]", status); 488 if (U_FAILURE(status)) { errln("FAIL"); return; } 489 UnicodeSet set2("[i-o]", status); 490 if (U_FAILURE(status)) { errln("FAIL"); return; } 491 set.addAll(set2); 492 expectPairs(set, "hq"); 493 // right 494 set.applyPattern("[a-m]", status); 495 if (U_FAILURE(status)) { errln("FAIL"); return; } 496 set2.applyPattern("[e-o]", status); 497 if (U_FAILURE(status)) { errln("FAIL"); return; } 498 set.addAll(set2); 499 expectPairs(set, "ao"); 500 // left 501 set.applyPattern("[e-o]", status); 502 if (U_FAILURE(status)) { errln("FAIL"); return; } 503 set2.applyPattern("[a-m]", status); 504 if (U_FAILURE(status)) { errln("FAIL"); return; } 505 set.addAll(set2); 506 expectPairs(set, "ao"); 507 // 1 overlap against 3 508 set.applyPattern("[a-eg-mo-w]", status); 509 if (U_FAILURE(status)) { errln("FAIL"); return; } 510 set2.applyPattern("[d-q]", status); 511 if (U_FAILURE(status)) { errln("FAIL"); return; } 512 set.addAll(set2); 513 expectPairs(set, "aw"); 514 } 515 516 void UnicodeSetTest::TestAPI() { 517 UErrorCode status = U_ZERO_ERROR; 518 // default ct 519 UnicodeSet set; 520 if (!set.isEmpty() || set.getRangeCount() != 0) { 521 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 522 set); 523 } 524 525 // clear(), 
isEmpty() 526 set.add(0x0061); 527 if (set.isEmpty()) { 528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 529 set); 530 } 531 set.clear(); 532 if (!set.isEmpty()) { 533 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 534 set); 535 } 536 537 // size() 538 set.clear(); 539 if (set.size() != 0) { 540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 541 ": " + set); 542 } 543 set.add(0x0061); 544 if (set.size() != 1) { 545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 546 ": " + set); 547 } 548 set.add(0x0031, 0x0039); 549 if (set.size() != 10) { 550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 551 ": " + set); 552 } 553 554 // contains(first, last) 555 set.clear(); 556 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 557 if (U_FAILURE(status)) { errln("FAIL"); return; } 558 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 559 UChar32 a = set.getRangeStart(i); 560 UChar32 b = set.getRangeEnd(i); 561 if (!set.contains(a, b)) { 562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 563 " but doesn't: " + set); 564 } 565 if (set.contains((UChar32)(a-1), b)) { 566 errln((UnicodeString)"FAIL, shouldn't contain " + 567 (unsigned short)(a-1) + '-' + (unsigned short)b + 568 " but does: " + set); 569 } 570 if (set.contains(a, (UChar32)(b+1))) { 571 errln((UnicodeString)"FAIL, shouldn't contain " + 572 (unsigned short)a + '-' + (unsigned short)(b+1) + 573 " but does: " + set); 574 } 575 } 576 577 // Ported InversionList test. 
578 UnicodeSet a((UChar32)3,(UChar32)10); 579 UnicodeSet b((UChar32)7,(UChar32)15); 580 UnicodeSet c; 581 582 logln((UnicodeString)"a [3-10]: " + a); 583 logln((UnicodeString)"b [7-15]: " + b); 584 c = a; 585 c.addAll(b); 586 UnicodeSet exp((UChar32)3,(UChar32)15); 587 if (c == exp) { 588 logln((UnicodeString)"c.set(a).add(b): " + c); 589 } else { 590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 591 } 592 c.complement(); 593 exp.set((UChar32)0, (UChar32)2); 594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 595 if (c == exp) { 596 logln((UnicodeString)"c.complement(): " + c); 597 } else { 598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 599 } 600 c.complement(); 601 exp.set((UChar32)3, (UChar32)15); 602 if (c == exp) { 603 logln((UnicodeString)"c.complement(): " + c); 604 } else { 605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 606 } 607 c = a; 608 c.complementAll(b); 609 exp.set((UChar32)3,(UChar32)6); 610 exp.add((UChar32)11,(UChar32) 15); 611 if (c == exp) { 612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 613 } else { 614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 615 } 616 617 exp = c; 618 bitsToSet(setToBits(c), c); 619 if (c == exp) { 620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 621 } else { 622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 623 } 624 625 // Additional tests for coverage JB#2118 626 //UnicodeSet::complement(class UnicodeString const &) 627 //UnicodeSet::complementAll(class UnicodeString const &) 628 //UnicodeSet::containsNone(class UnicodeSet const &) 629 //UnicodeSet::containsNone(long,long) 630 //UnicodeSet::containsSome(class UnicodeSet const &) 631 //UnicodeSet::containsSome(long,long) 632 //UnicodeSet::removeAll(class UnicodeString const &) 633 //UnicodeSet::retain(long) 634 //UnicodeSet::retainAll(class UnicodeString const &) 635 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 636 //UnicodeSetIterator::getString(void) 637 set.clear(); 638 set.complement("ab"); 639 exp.applyPattern("[{ab}]", status); 640 if (U_FAILURE(status)) { errln("FAIL"); return; } 641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 642 643 UnicodeSetIterator iset(set); 644 if (!iset.next() || !iset.isString()) { 645 errln("FAIL: UnicodeSetIterator::next/isString"); 646 } else if (iset.getString() != "ab") { 647 errln("FAIL: UnicodeSetIterator::getString"); 648 } 649 650 set.add((UChar32)0x61, (UChar32)0x7A); 651 set.complementAll("alan"); 652 exp.applyPattern("[{ab}b-kmo-z]", status); 653 if (U_FAILURE(status)) { errln("FAIL"); return; } 654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 655 656 exp.applyPattern("[a-z]", status); 657 if (U_FAILURE(status)) { errln("FAIL"); return; } 658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 660 exp.applyPattern("[aln]", status); 661 if (U_FAILURE(status)) { errln("FAIL"); return; } 662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 664 665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 666 errln("FAIL: containsNone(UChar32, UChar32)"); 667 } 668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 669 errln("FAIL: containsSome(UChar32, UChar32)"); 670 } 671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 672 errln("FAIL: containsNone(UChar32, UChar32)"); 673 } 674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 675 errln("FAIL: containsSome(UChar32, UChar32)"); 676 } 677 678 set.removeAll("liu"); 679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 680 if (U_FAILURE(status)) { errln("FAIL"); return; } 681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 682 683 set.retainAll("star"); 684 
exp.applyPattern("[rst]", status); 685 if (U_FAILURE(status)) { errln("FAIL"); return; } 686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 687 688 set.retain((UChar32)0x73); 689 exp.applyPattern("[s]", status); 690 if (U_FAILURE(status)) { errln("FAIL"); return; } 691 if (set != exp) { errln("FAIL: retain('s')"); return; } 692 693 uint16_t buf[32]; 694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 697 errln("FAIL: serialize"); 698 return; 699 } 700 701 // Conversions to and from USet 702 UnicodeSet *uniset = &set; 703 USet *uset = uniset->toUSet(); 704 TEST_ASSERT((void *)uset == (void *)uniset); 705 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 706 TEST_ASSERT((void *)setx == (void *)uset); 707 const UnicodeSet *constSet = uniset; 708 const USet *constUSet = constSet->toUSet(); 709 TEST_ASSERT((void *)constUSet == (void *)constSet); 710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 711 TEST_ASSERT((void *)constSetx == (void *)constUSet); 712 713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods 714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc"); 715 UnicodeSet ac(0x61, 0x63); 716 ac.remove(0x62).freeze(); 717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 || 718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 || 719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 || 720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 || 721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 || 722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 || 723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 || 724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 || 725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 || 726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30 727 ) { 728 errln("UnicodeSet.span(UnicodeString, ...) 
returns incorrect end indexes"); 729 } 730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 || 731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 || 732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 || 733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 || 734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 || 735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 || 736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 || 737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 || 738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 || 739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20 740 ) { 741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes"); 742 } 743 } 744 745 void UnicodeSetTest::TestIteration() { 746 UErrorCode ec = U_ZERO_ERROR; 747 int i = 0; 748 int outerLoop; 749 750 // 6 code points, 3 ranges, 2 strings, 8 total elements 751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 753 TEST_ASSERT_SUCCESS(ec); 754 UnicodeSetIterator it(set); 755 756 for (outerLoop=0; outerLoop<3; outerLoop++) { 757 // Run the test multiple times, to check that iterator.reset() is working. 
758 for (i=0; i<10; i++) { 759 UBool nextv = it.next(); 760 UBool isString = it.isString(); 761 int32_t codePoint = it.getCodepoint(); 762 //int32_t codePointEnd = it.getCodepointEnd(); 763 UnicodeString s = it.getString(); 764 switch (i) { 765 case 0: 766 TEST_ASSERT(nextv == TRUE); 767 TEST_ASSERT(isString == FALSE); 768 TEST_ASSERT(codePoint==0x61); 769 TEST_ASSERT(s == "a"); 770 break; 771 case 1: 772 TEST_ASSERT(nextv == TRUE); 773 TEST_ASSERT(isString == FALSE); 774 TEST_ASSERT(codePoint==0x62); 775 TEST_ASSERT(s == "b"); 776 break; 777 case 2: 778 TEST_ASSERT(nextv == TRUE); 779 TEST_ASSERT(isString == FALSE); 780 TEST_ASSERT(codePoint==0x63); 781 TEST_ASSERT(s == "c"); 782 break; 783 case 3: 784 TEST_ASSERT(nextv == TRUE); 785 TEST_ASSERT(isString == FALSE); 786 TEST_ASSERT(codePoint==0x79); 787 TEST_ASSERT(s == "y"); 788 break; 789 case 4: 790 TEST_ASSERT(nextv == TRUE); 791 TEST_ASSERT(isString == FALSE); 792 TEST_ASSERT(codePoint==0x7a); 793 TEST_ASSERT(s == "z"); 794 break; 795 case 5: 796 TEST_ASSERT(nextv == TRUE); 797 TEST_ASSERT(isString == FALSE); 798 TEST_ASSERT(codePoint==0x1abcd); 799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 800 break; 801 case 6: 802 TEST_ASSERT(nextv == TRUE); 803 TEST_ASSERT(isString == TRUE); 804 TEST_ASSERT(s == "str1"); 805 break; 806 case 7: 807 TEST_ASSERT(nextv == TRUE); 808 TEST_ASSERT(isString == TRUE); 809 TEST_ASSERT(s == "str2"); 810 break; 811 case 8: 812 TEST_ASSERT(nextv == FALSE); 813 break; 814 case 9: 815 TEST_ASSERT(nextv == FALSE); 816 break; 817 } 818 } 819 it.reset(); // prepare to run the iteration again. 
820 } 821 } 822 823 824 825 826 void UnicodeSetTest::TestStrings() { 827 UErrorCode ec = U_ZERO_ERROR; 828 829 UnicodeSet* testList[] = { 830 UnicodeSet::createFromAll("abc"), 831 new UnicodeSet("[a-c]", ec), 832 833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 834 new UnicodeSet("[{ll}{ch}a-z]", ec), 835 836 UnicodeSet::createFrom("ab}c"), 837 new UnicodeSet("[{ab\\}c}]", ec), 838 839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 841 842 NULL 843 }; 844 845 if (U_FAILURE(ec)) { 846 errln("FAIL: couldn't construct test sets"); 847 } 848 849 for (int32_t i = 0; testList[i] != NULL; i+=2) { 850 if (U_SUCCESS(ec)) { 851 UnicodeString pat0, pat1; 852 testList[i]->toPattern(pat0, TRUE); 853 testList[i+1]->toPattern(pat1, TRUE); 854 if (*testList[i] == *testList[i+1]) { 855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 856 } else { 857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 858 } 859 } 860 delete testList[i]; 861 delete testList[i+1]; 862 } 863 } 864 865 /** 866 * Test the [:Latin:] syntax. 867 */ 868 void UnicodeSetTest::TestScriptSet() { 869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 870 871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 872 873 /* Jitterbug 1423 */ 874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 875 876 } 877 878 /** 879 * Test the [:Latin:] syntax. 
*/
void UnicodeSetTest::TestPropertySet() {
    // DATA is a flat table of triples: a set pattern, characters that must be
    // in the set, and characters that must not be. The loop at the bottom
    // passes each triple to expectContainment().
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter a }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u061D\\u065F\\uFDEF\\uFDFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
        "\\uFDF2"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

    // Walk the table one triple at a time. The pattern is converted with
    // US_INV, while the two character lists go through CharsToUnicodeString().
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}

/**
 * Test that Posix style character classes [:digit:], etc.
 *   have the Unicode definitions from TR 18.
1072 */ 1073 void UnicodeSetTest::TestPosixClasses() { 1074 { 1075 UErrorCode status = U_ZERO_ERROR; 1076 UnicodeSet s1("[:alpha:]", status); 1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1078 TEST_ASSERT_SUCCESS(status); 1079 TEST_ASSERT(s1==s2); 1080 } 1081 { 1082 UErrorCode status = U_ZERO_ERROR; 1083 UnicodeSet s1("[:lower:]", status); 1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1085 TEST_ASSERT_SUCCESS(status); 1086 TEST_ASSERT(s1==s2); 1087 } 1088 { 1089 UErrorCode status = U_ZERO_ERROR; 1090 UnicodeSet s1("[:upper:]", status); 1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1092 TEST_ASSERT_SUCCESS(status); 1093 TEST_ASSERT(s1==s2); 1094 } 1095 { 1096 UErrorCode status = U_ZERO_ERROR; 1097 UnicodeSet s1("[:punct:]", status); 1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1099 TEST_ASSERT_SUCCESS(status); 1100 TEST_ASSERT(s1==s2); 1101 } 1102 { 1103 UErrorCode status = U_ZERO_ERROR; 1104 UnicodeSet s1("[:digit:]", status); 1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1106 TEST_ASSERT_SUCCESS(status); 1107 TEST_ASSERT(s1==s2); 1108 } 1109 { 1110 UErrorCode status = U_ZERO_ERROR; 1111 UnicodeSet s1("[:xdigit:]", status); 1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1113 TEST_ASSERT_SUCCESS(status); 1114 TEST_ASSERT(s1==s2); 1115 } 1116 { 1117 UErrorCode status = U_ZERO_ERROR; 1118 UnicodeSet s1("[:alnum:]", status); 1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1120 TEST_ASSERT_SUCCESS(status); 1121 TEST_ASSERT(s1==s2); 1122 } 1123 { 1124 UErrorCode status = U_ZERO_ERROR; 1125 UnicodeSet s1("[:space:]", status); 1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1127 TEST_ASSERT_SUCCESS(status); 1128 TEST_ASSERT(s1==s2); 1129 } 1130 { 1131 UErrorCode status = U_ZERO_ERROR; 1132 UnicodeSet s1("[:blank:]", status); 1133 
TEST_ASSERT_SUCCESS(status); 1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1135 status); 1136 TEST_ASSERT_SUCCESS(status); 1137 TEST_ASSERT(s1==s2); 1138 } 1139 { 1140 UErrorCode status = U_ZERO_ERROR; 1141 UnicodeSet s1("[:cntrl:]", status); 1142 TEST_ASSERT_SUCCESS(status); 1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1144 TEST_ASSERT_SUCCESS(status); 1145 TEST_ASSERT(s1==s2); 1146 } 1147 { 1148 UErrorCode status = U_ZERO_ERROR; 1149 UnicodeSet s1("[:graph:]", status); 1150 TEST_ASSERT_SUCCESS(status); 1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1152 TEST_ASSERT_SUCCESS(status); 1153 TEST_ASSERT(s1==s2); 1154 } 1155 { 1156 UErrorCode status = U_ZERO_ERROR; 1157 UnicodeSet s1("[:print:]", status); 1158 TEST_ASSERT_SUCCESS(status); 1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1160 TEST_ASSERT_SUCCESS(status); 1161 TEST_ASSERT(s1==s2); 1162 } 1163 } 1164 /** 1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 1166 */ 1167 void UnicodeSetTest::TestClone() { 1168 UErrorCode ec = U_ZERO_ERROR; 1169 UnicodeSet s("[abcxyz]", ec); 1170 UnicodeSet t(s); 1171 expectContainment(t, "abc", "def"); 1172 } 1173 1174 /** 1175 * Test the indexOf() and charAt() methods. 
1176 */ 1177 void UnicodeSetTest::TestIndexOf() { 1178 UErrorCode ec = U_ZERO_ERROR; 1179 UnicodeSet set("[a-cx-y3578]", ec); 1180 if (U_FAILURE(ec)) { 1181 errln("FAIL: UnicodeSet constructor"); 1182 return; 1183 } 1184 for (int32_t i=0; i<set.size(); ++i) { 1185 UChar32 c = set.charAt(i); 1186 if (set.indexOf(c) != i) { 1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1188 i, c, set.indexOf(c)); 1189 } 1190 } 1191 UChar32 c = set.charAt(set.size()); 1192 if (c != -1) { 1193 errln("FAIL: charAt(<out of range>) = %X", c); 1194 } 1195 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1196 if (j != -1) { 1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1198 } 1199 } 1200 1201 /** 1202 * Test closure API. 1203 */ 1204 void UnicodeSetTest::TestCloseOver() { 1205 UErrorCode ec = U_ZERO_ERROR; 1206 1207 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1209 const char* DATA[] = { 1210 // selector, input, output 1211 CASE, 1212 "[aq\\u00DF{Bc}{bC}{Fi}]", 1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1214 1215 CASE, 1216 "[\\u01F1]", // 'DZ' 1217 "[\\u01F1\\u01F2\\u01F3]", 1218 1219 CASE, 1220 "[\\u1FB4]", 1221 "[\\u1FB4{\\u03AC\\u03B9}]", 1222 1223 CASE, 1224 "[{F\\uFB01}]", 1225 "[\\uFB03{ffi}]", 1226 1227 CASE, // make sure binary search finds limits 1228 "[a\\uFF3A]", 1229 "[aA\\uFF3A\\uFF5A]", 1230 1231 CASE, 1232 "[a-z]","[A-Za-z\\u017F\\u212A]", 1233 CASE, 1234 "[abc]","[A-Ca-c]", 1235 CASE, 1236 "[ABC]","[A-Ca-c]", 1237 1238 CASE, "[i]", "[iI]", 1239 1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1242 1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i 1244 1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1246 1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1248 1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1250 1251 
CASE, "[\\u03f7]", "[\\u03f7\\u03f8]", 1252 1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1254 1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1257 1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1259 1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1261 1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1263 1264 #if !UCONFIG_NO_FILE_IO 1265 CASE_MAPPINGS, 1266 "[aq\\u00DF{Bc}{bC}{Fi}]", 1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1268 #endif 1269 1270 CASE_MAPPINGS, 1271 "[\\u01F1]", // 'DZ' 1272 "[\\u01F1\\u01F2\\u01F3]", 1273 1274 CASE_MAPPINGS, 1275 "[a-z]", 1276 "[A-Za-z]", 1277 1278 NULL 1279 }; 1280 1281 UnicodeSet s; 1282 UnicodeSet t; 1283 UnicodeString buf; 1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1285 int32_t selector = DATA[i][0]; 1286 UnicodeString pat(DATA[i+1], -1, US_INV); 1287 UnicodeString exp(DATA[i+2], -1, US_INV); 1288 s.applyPattern(pat, ec); 1289 s.closeOver(selector); 1290 t.applyPattern(exp, ec); 1291 if (U_FAILURE(ec)) { 1292 errln("FAIL: applyPattern failed"); 1293 continue; 1294 } 1295 if (s == t) { 1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1297 } else { 1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1299 s.toPattern(buf, TRUE) + ", expected " + exp); 1300 } 1301 } 1302 1303 #if 0 1304 /* 1305 * Unused test code. 1306 * This was used to compare the old implementation (using USET_CASE) 1307 * with the new one (using 0x100 temporarily) 1308 * while transitioning from hardcoded case closure tables in uniset.cpp 1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1310 * and using ucase.c functions for closure. 
1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1312 * 1313 * Note: The old and new implementation never fully matched because 1314 * the old implementation turned out to not map U+0130 and U+0131 correctly 1315 * (dotted I and dotless i) and because the old implementation's data tables 1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1317 * new implementation. (So sigmas and some other characters were not handled 1318 * according to the newer Unicode version.) 1319 */ 1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1321 UnicodeSetIterator si(sens); 1322 UnicodeString str, buf2; 1323 const UnicodeString *pStr; 1324 UChar32 c; 1325 while(si.next()) { 1326 if(!si.isString()) { 1327 c=si.getCodepoint(); 1328 s.clear(); 1329 s.add(c); 1330 1331 str.setTo(c); 1332 str.foldCase(); 1333 sens2.add(str); 1334 1335 t=s; 1336 s.closeOver(USET_CASE); 1337 t.closeOver(0x100); 1338 if(s!=t) { 1339 errln("FAIL: closeOver(U+%04x) differs: ", c); 1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1341 } 1342 } 1343 } 1344 // remove all code points 1345 // should contain all full case folding mapping strings 1346 sens2.remove(0, 0x10ffff); 1347 si.reset(sens2); 1348 while(si.next()) { 1349 if(si.isString()) { 1350 pStr=&si.getString(); 1351 s.clear(); 1352 s.add(*pStr); 1353 t=s2=s; 1354 s.closeOver(USET_CASE); 1355 t.closeOver(0x100); 1356 if(s!=t) { 1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1359 } 1360 } 1361 } 1362 #endif 1363 1364 // Test the pattern API 1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1366 if (U_FAILURE(ec)) { 1367 errln("FAIL: applyPattern failed"); 1368 } else { 1369 expectContainment(s, "abcABC", "defDEF"); 1370 } 1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1372 if (U_FAILURE(ec)) { 1373 
errln("FAIL: constructor failed"); 1374 } else { 1375 expectContainment(v, "defDEF", "abcABC"); 1376 } 1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1378 if (U_FAILURE(ec)) { 1379 errln("FAIL: construct w/case mappings failed"); 1380 } else { 1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1382 } 1383 } 1384 1385 void UnicodeSetTest::TestEscapePattern() { 1386 const char pattern[] = 1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1388 const char exp[] = 1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1390 // We test this with two passes; in the second pass we 1391 // pre-unescape the pattern. Since U+200E is rule whitespace, 1392 // this fails -- which is what we expect. 1393 for (int32_t pass=1; pass<=2; ++pass) { 1394 UErrorCode ec = U_ZERO_ERROR; 1395 UnicodeString pat(pattern, -1, US_INV); 1396 if (pass==2) { 1397 pat = pat.unescape(); 1398 } 1399 // Pattern is only good for pass 1 1400 UBool isPatternValid = (pass==1); 1401 1402 UnicodeSet set(pat, ec); 1403 if (U_SUCCESS(ec) != isPatternValid){ 1404 errln((UnicodeString)"FAIL: applyPattern(" + 1405 escape(pat) + ") => " + 1406 u_errorName(ec)); 1407 continue; 1408 } 1409 if (U_FAILURE(ec)) { 1410 continue; 1411 } 1412 if (set.contains((UChar)0x0644)){ 1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1414 } 1415 1416 UnicodeString newpat; 1417 set.toPattern(newpat, TRUE); 1418 if (newpat == UnicodeString(exp, -1, US_INV)) { 1419 logln(escape(pat) + " => " + newpat); 1420 } else { 1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1422 } 1423 1424 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1425 UnicodeString str("Range "); 1426 str.append((UChar)(0x30 + i)) 1427 .append(": ") 1428 .append((UChar32)set.getRangeStart(i)) 1429 .append(" - ") 1430 .append((UChar32)set.getRangeEnd(i)); 1431 str = str + " (" + set.getRangeStart(i) + " - " + 1432 
set.getRangeEnd(i) + ")"; 1433 if (set.getRangeStart(i) < 0) { 1434 errln((UnicodeString)"FAIL: " + escape(str)); 1435 } else { 1436 logln(escape(str)); 1437 } 1438 } 1439 } 1440 } 1441 1442 void UnicodeSetTest::expectRange(const UnicodeString& label, 1443 const UnicodeSet& set, 1444 UChar32 start, UChar32 end) { 1445 UnicodeSet exp(start, end); 1446 UnicodeString pat; 1447 if (set == exp) { 1448 logln(label + " => " + set.toPattern(pat, TRUE)); 1449 } else { 1450 UnicodeString xpat; 1451 errln((UnicodeString)"FAIL: " + label + " => " + 1452 set.toPattern(pat, TRUE) + 1453 ", expected " + exp.toPattern(xpat, TRUE)); 1454 } 1455 } 1456 1457 void UnicodeSetTest::TestInvalidCodePoint() { 1458 1459 const UChar32 DATA[] = { 1460 // Test range Expected range 1461 0, 0x10FFFF, 0, 0x10FFFF, 1462 (UChar32)-1, 8, 0, 8, 1463 8, 0x110000, 8, 0x10FFFF 1464 }; 1465 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1466 1467 UnicodeString pat; 1468 int32_t i; 1469 1470 for (i=0; i<DATA_LENGTH; i+=4) { 1471 UChar32 start = DATA[i]; 1472 UChar32 end = DATA[i+1]; 1473 UChar32 xstart = DATA[i+2]; 1474 UChar32 xend = DATA[i+3]; 1475 1476 // Try various API using the test code points 1477 1478 UnicodeSet set(start, end); 1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1480 set, xstart, xend); 1481 1482 set.clear(); 1483 set.set(start, end); 1484 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1485 set, xstart, xend); 1486 1487 UBool b = set.contains(start); 1488 b = set.contains(start, end); 1489 b = set.containsNone(start, end); 1490 b = set.containsSome(start, end); 1491 1492 /*int32_t index = set.indexOf(start);*/ 1493 1494 set.clear(); 1495 set.add(start); 1496 set.add(start, end); 1497 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1498 set, xstart, xend); 1499 1500 set.set(0, 0x10FFFF); 1501 set.retain(start, end); 1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1503 set, xstart, xend); 1504 
set.retain(start); 1505 1506 set.set(0, 0x10FFFF); 1507 set.remove(start); 1508 set.remove(start, end); 1509 set.complement(); 1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1511 set, xstart, xend); 1512 1513 set.set(0, 0x10FFFF); 1514 set.complement(start, end); 1515 set.complement(); 1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1517 set, xstart, xend); 1518 set.complement(start); 1519 } 1520 1521 const UChar32 DATA2[] = { 1522 0, 1523 0x10FFFF, 1524 (UChar32)-1, 1525 0x110000 1526 }; 1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1528 1529 for (i=0; i<DATA2_LENGTH; ++i) { 1530 UChar32 c = DATA2[i], end = 0x10FFFF; 1531 UBool valid = (c >= 0 && c <= 0x10FFFF); 1532 1533 UnicodeSet set(0, 0x10FFFF); 1534 1535 // For single-codepoint contains, invalid codepoints are NOT contained 1536 UBool b = set.contains(c); 1537 if (b == valid) { 1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1539 ") = " + b); 1540 } else { 1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1542 ") = " + b); 1543 } 1544 1545 // For codepoint range contains, containsNone, and containsSome, 1546 // invalid or empty (start > end) ranges have UNDEFINED behavior. 
1547 b = set.contains(c, end); 1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1549 "," + end + ") = " + b); 1550 1551 b = set.containsNone(c, end); 1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1553 "," + end + ") = " + b); 1554 1555 b = set.containsSome(c, end); 1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1557 "," + end + ") = " + b); 1558 1559 int32_t index = set.indexOf(c); 1560 if ((index >= 0) == valid) { 1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1562 ") = " + index); 1563 } else { 1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1565 ") = " + index); 1566 } 1567 } 1568 } 1569 1570 // Used by TestSymbolTable 1571 class TokenSymbolTable : public SymbolTable { 1572 public: 1573 Hashtable contents; 1574 1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1576 contents.setValueDeleter(uhash_deleteUnicodeString); 1577 } 1578 1579 ~TokenSymbolTable() {} 1580 1581 /** 1582 * (Non-SymbolTable API) Add the given variable and value to 1583 * the table. Variable should NOT contain leading '$'. 
1584 */ 1585 void add(const UnicodeString& var, const UnicodeString& value, 1586 UErrorCode& ec) { 1587 if (U_SUCCESS(ec)) { 1588 contents.put(var, new UnicodeString(value), ec); 1589 } 1590 } 1591 1592 /** 1593 * SymbolTable API 1594 */ 1595 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1596 return (const UnicodeString*) contents.get(s); 1597 } 1598 1599 /** 1600 * SymbolTable API 1601 */ 1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1603 return NULL; 1604 } 1605 1606 /** 1607 * SymbolTable API 1608 */ 1609 virtual UnicodeString parseReference(const UnicodeString& text, 1610 ParsePosition& pos, int32_t limit) const { 1611 int32_t start = pos.getIndex(); 1612 int32_t i = start; 1613 UnicodeString result; 1614 while (i < limit) { 1615 UChar c = text.charAt(i); 1616 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1617 break; 1618 } 1619 ++i; 1620 } 1621 if (i == start) { // No valid name chars 1622 return result; // Indicate failure with empty string 1623 } 1624 pos.setIndex(i); 1625 text.extractBetween(start, i, result); 1626 return result; 1627 } 1628 }; 1629 1630 void UnicodeSetTest::TestSymbolTable() { 1631 // Multiple test cases can be set up here. Each test case 1632 // is terminated by null: 1633 // var, value, var, value,..., input pat., exp. 
output pat., null 1634 const char* DATA[] = { 1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1638 NULL 1639 }; 1640 1641 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1642 UErrorCode ec = U_ZERO_ERROR; 1643 TokenSymbolTable sym(ec); 1644 if (U_FAILURE(ec)) { 1645 errln("FAIL: couldn't construct TokenSymbolTable"); 1646 continue; 1647 } 1648 1649 // Set up variables 1650 while (DATA[i+2] != NULL) { 1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1652 if (U_FAILURE(ec)) { 1653 errln("FAIL: couldn't add to TokenSymbolTable"); 1654 continue; 1655 } 1656 i += 2; 1657 } 1658 1659 // Input pattern and expected output pattern 1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1661 i += 2; 1662 1663 ParsePosition pos(0); 1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1665 if (U_FAILURE(ec)) { 1666 errln("FAIL: couldn't construct UnicodeSet"); 1667 continue; 1668 } 1669 1670 // results 1671 if (pos.getIndex() != inpat.length()) { 1672 errln((UnicodeString)"Failed to read to end of string \"" 1673 + inpat + "\": read to " 1674 + pos.getIndex() + ", length is " 1675 + inpat.length()); 1676 } 1677 1678 UnicodeSet us2(exppat, ec); 1679 if (U_FAILURE(ec)) { 1680 errln("FAIL: couldn't construct expected UnicodeSet"); 1681 continue; 1682 } 1683 1684 UnicodeString a, b; 1685 if (us != us2) { 1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1687 ", expected " + us2.toPattern(b, TRUE)); 1688 } else { 1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1690 } 1691 } 1692 } 1693 1694 void UnicodeSetTest::TestSurrogate() { 1695 const char* DATA[] = { 1696 // These should all behave identically 1697 "[abc\\uD800\\uDC00]", 1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1699 "[abc\\U00010000]", 1700 0 1701 }; 1702 for (int 
i=0; DATA[i] != 0; ++i) { 1703 UErrorCode ec = U_ZERO_ERROR; 1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1706 UnicodeSet set(str, ec); 1707 if (U_FAILURE(ec)) { 1708 errln("FAIL: UnicodeSet constructor"); 1709 continue; 1710 } 1711 expectContainment(set, 1712 CharsToUnicodeString("abc\\U00010000"), 1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1714 if (set.size() != 4) { 1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1716 set.size() + ", expected 4"); 1717 } 1718 } 1719 } 1720 1721 void UnicodeSetTest::TestExhaustive() { 1722 // exhaustive tests. Simulate UnicodeSets with integers. 1723 // That gives us very solid tests (except for large memory tests). 1724 1725 int32_t limit = 128; 1726 1727 UnicodeSet x, y, z, aa; 1728 1729 for (int32_t i = 0; i < limit; ++i) { 1730 bitsToSet(i, x); 1731 logln((UnicodeString)"Testing " + i + ", " + x); 1732 _testComplement(i, x, y); 1733 1734 // AS LONG AS WE ARE HERE, check roundtrip 1735 checkRoundTrip(bitsToSet(i, aa)); 1736 1737 for (int32_t j = 0; j < limit; ++j) { 1738 _testAdd(i,j, x,y,z); 1739 _testXor(i,j, x,y,z); 1740 _testRetain(i,j, x,y,z); 1741 _testRemove(i,j, x,y,z); 1742 } 1743 } 1744 } 1745 1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1747 bitsToSet(a, x); 1748 z = x; 1749 z.complement(); 1750 int32_t c = setToBits(z); 1751 if (c != (~a)) { 1752 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1753 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1754 } 1755 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1756 } 1757 1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1759 bitsToSet(a, x); 1760 bitsToSet(b, y); 1761 z = x; 1762 z.addAll(y); 1763 int32_t c = setToBits(z); 1764 if (c != (a | b)) { 1765 
errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1766 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1767 } 1768 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1769 } 1770 1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1772 bitsToSet(a, x); 1773 bitsToSet(b, y); 1774 z = x; 1775 z.retainAll(y); 1776 int32_t c = setToBits(z); 1777 if (c != (a & b)) { 1778 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1779 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1780 } 1781 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1782 } 1783 1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1785 bitsToSet(a, x); 1786 bitsToSet(b, y); 1787 z = x; 1788 z.removeAll(y); 1789 int32_t c = setToBits(z); 1790 if (c != (a &~ b)) { 1791 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1792 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1793 } 1794 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1795 } 1796 1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1798 bitsToSet(a, x); 1799 bitsToSet(b, y); 1800 z = x; 1801 z.complementAll(y); 1802 int32_t c = setToBits(z); 1803 if (c != (a ^ b)) { 1804 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1805 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1806 } 1807 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1808 } 1809 1810 /** 1811 * Check that ranges are monotonically increasing and non- 1812 * overlapping. 
1813 */ 1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1815 int32_t n = set.getRangeCount(); 1816 if (n < 0) { 1817 errln((UnicodeString)"FAIL result of " + msg + 1818 ": range count should be >= 0 but is " + 1819 n /*+ " for " + set.toPattern())*/); 1820 return; 1821 } 1822 UChar32 last = 0; 1823 for (int32_t i=0; i<n; ++i) { 1824 UChar32 start = set.getRangeStart(i); 1825 UChar32 end = set.getRangeEnd(i); 1826 if (start > end) { 1827 errln((UnicodeString)"FAIL result of " + msg + 1828 ": range " + (i+1) + 1829 " start > end: " + (int)start + ", " + (int)end + 1830 " for " + set); 1831 } 1832 if (i > 0 && start <= last) { 1833 errln((UnicodeString)"FAIL result of " + msg + 1834 ": range " + (i+1) + 1835 " overlaps previous range: " + (int)start + ", " + (int)end + 1836 " for " + set); 1837 } 1838 last = end; 1839 } 1840 } 1841 1842 /** 1843 * Convert a bitmask to a UnicodeSet. 1844 */ 1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1846 result.clear(); 1847 for (UChar32 i = 0; i < 32; ++i) { 1848 if ((a & (1<<i)) != 0) { 1849 result.add(i); 1850 } 1851 } 1852 return result; 1853 } 1854 1855 /** 1856 * Convert a UnicodeSet to a bitmask. Only the characters 1857 * U+0000 to U+0020 are represented in the bitmask. 1858 */ 1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1860 int32_t result = 0; 1861 for (int32_t i = 0; i < 32; ++i) { 1862 if (x.contains((UChar32)i)) { 1863 result |= (1<<i); 1864 } 1865 } 1866 return result; 1867 } 1868 1869 /** 1870 * Return the representation of an inversion list based UnicodeSet 1871 * as a pairs list. Ranges are listed in ascending Unicode order. 1872 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1873 */ 1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1875 UnicodeString pairs; 1876 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1877 UChar32 start = set.getRangeStart(i); 1878 UChar32 end = set.getRangeEnd(i); 1879 if (end > 0xFFFF) { 1880 end = 0xFFFF; 1881 i = set.getRangeCount(); // Should be unnecessary 1882 } 1883 pairs.append((UChar)start).append((UChar)end); 1884 } 1885 return pairs; 1886 } 1887 1888 /** 1889 * Basic consistency check for a few items. 1890 * That the iterator works, and that we can create a pattern and 1891 * get the same thing back 1892 */ 1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1894 UErrorCode ec = U_ZERO_ERROR; 1895 1896 UnicodeSet t(s); 1897 checkEqual(s, t, "copy ct"); 1898 1899 t = s; 1900 checkEqual(s, t, "operator="); 1901 1902 copyWithIterator(t, s, FALSE); 1903 checkEqual(s, t, "iterator roundtrip"); 1904 1905 copyWithIterator(t, s, TRUE); // try range 1906 checkEqual(s, t, "iterator roundtrip"); 1907 1908 UnicodeString pat; s.toPattern(pat, FALSE); 1909 t.applyPattern(pat, ec); 1910 if (U_FAILURE(ec)) { 1911 errln("FAIL: applyPattern"); 1912 return; 1913 } else { 1914 checkEqual(s, t, "toPattern(false)"); 1915 } 1916 1917 s.toPattern(pat, TRUE); 1918 t.applyPattern(pat, ec); 1919 if (U_FAILURE(ec)) { 1920 errln("FAIL: applyPattern"); 1921 return; 1922 } else { 1923 checkEqual(s, t, "toPattern(true)"); 1924 } 1925 } 1926 1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1928 t.clear(); 1929 UnicodeSetIterator it(s); 1930 if (withRange) { 1931 while (it.nextRange()) { 1932 if (it.isString()) { 1933 t.add(it.getString()); 1934 } else { 1935 t.add(it.getCodepoint(), it.getCodepointEnd()); 1936 } 1937 } 1938 } else { 1939 while (it.next()) { 1940 if (it.isString()) { 1941 t.add(it.getString()); 1942 } else { 1943 t.add(it.getCodepoint()); 1944 } 1945 } 1946 } 1947 } 1948 1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const 
UnicodeSet& t, const char* message) { 1950 UnicodeString source; s.toPattern(source, TRUE); 1951 UnicodeString result; t.toPattern(result, TRUE); 1952 if (s != t) { 1953 errln((UnicodeString)"FAIL: " + message 1954 + "; source = " + source 1955 + "; result = " + result 1956 ); 1957 return FALSE; 1958 } else { 1959 logln((UnicodeString)"Ok: " + message 1960 + "; source = " + source 1961 + "; result = " + result 1962 ); 1963 } 1964 return TRUE; 1965 } 1966 1967 void 1968 UnicodeSetTest::expectContainment(const UnicodeString& pat, 1969 const UnicodeString& charsIn, 1970 const UnicodeString& charsOut) { 1971 UErrorCode ec = U_ZERO_ERROR; 1972 UnicodeSet set(pat, ec); 1973 if (U_FAILURE(ec)) { 1974 dataerrln((UnicodeString)"FAIL: pattern \"" + 1975 pat + "\" => " + u_errorName(ec)); 1976 return; 1977 } 1978 expectContainment(set, pat, charsIn, charsOut); 1979 } 1980 1981 void 1982 UnicodeSetTest::expectContainment(const UnicodeSet& set, 1983 const UnicodeString& charsIn, 1984 const UnicodeString& charsOut) { 1985 UnicodeString pat; 1986 set.toPattern(pat); 1987 expectContainment(set, pat, charsIn, charsOut); 1988 } 1989 1990 void 1991 UnicodeSetTest::expectContainment(const UnicodeSet& set, 1992 const UnicodeString& setName, 1993 const UnicodeString& charsIn, 1994 const UnicodeString& charsOut) { 1995 UnicodeString bad; 1996 UChar32 c; 1997 int32_t i; 1998 1999 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 2000 c = charsIn.char32At(i); 2001 if (!set.contains(c)) { 2002 bad.append(c); 2003 } 2004 } 2005 if (bad.length() > 0) { 2006 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 2007 ", expected containment of " + prettify(charsIn)); 2008 } else { 2009 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 2010 } 2011 2012 bad.truncate(0); 2013 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 2014 c = charsOut.char32At(i); 2015 if (set.contains(c)) { 2016 bad.append(c); 2017 } 2018 } 2019 if (bad.length() 
> 0) { 2020 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 2021 ", expected non-containment of " + prettify(charsOut)); 2022 } else { 2023 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 2024 } 2025 } 2026 2027 void 2028 UnicodeSetTest::expectPattern(UnicodeSet& set, 2029 const UnicodeString& pattern, 2030 const UnicodeString& expectedPairs){ 2031 UErrorCode status = U_ZERO_ERROR; 2032 set.applyPattern(pattern, status); 2033 if (U_FAILURE(status)) { 2034 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2035 "\") failed"); 2036 return; 2037 } else { 2038 if (getPairs(set) != expectedPairs ) { 2039 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2040 "\") => pairs \"" + 2041 escape(getPairs(set)) + "\", expected \"" + 2042 escape(expectedPairs) + "\""); 2043 } else { 2044 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 2045 "\") => pairs \"" + 2046 escape(getPairs(set)) + "\""); 2047 } 2048 } 2049 // the result of calling set.toPattern(), which is the string representation of 2050 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2051 // will produce another set that is equal to this one. 
2052 UnicodeString temppattern; 2053 set.toPattern(temppattern); 2054 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2055 if (U_FAILURE(status)) { 2056 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2057 return; 2058 } 2059 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2060 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2061 escape(getPairs(set)) + "\"")); 2062 } else{ 2063 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2064 } 2065 2066 delete tempset; 2067 2068 } 2069 2070 void 2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2072 if (getPairs(set) != expectedPairs) { 2073 errln(UnicodeString("FAIL: Expected pair list \"") + 2074 escape(expectedPairs) + "\", got \"" + 2075 escape(getPairs(set)) + "\""); 2076 } 2077 } 2078 2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2080 const UnicodeString& expPat, 2081 const char** expStrings) { 2082 UnicodeString pat; 2083 set.toPattern(pat, TRUE); 2084 if (pat == expPat) { 2085 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2086 } else { 2087 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2088 return; 2089 } 2090 if (expStrings == NULL) { 2091 return; 2092 } 2093 UBool in = TRUE; 2094 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2095 if (expStrings[i] == NOT) { // sic; pointer comparison 2096 in = FALSE; 2097 continue; 2098 } 2099 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2100 UBool contained = set.contains(s); 2101 if (contained == in) { 2102 logln((UnicodeString)"Ok: " + expPat + 2103 (contained ? 
" contains {" : " does not contain {") + 2104 escape(expStrings[i]) + "}"); 2105 } else { 2106 errln((UnicodeString)"FAIL: " + expPat + 2107 (contained ? " contains {" : " does not contain {") + 2108 escape(expStrings[i]) + "}"); 2109 } 2110 } 2111 } 2112 2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } 2114 2115 void 2116 UnicodeSetTest::doAssert(UBool condition, const char *message) 2117 { 2118 if (!condition) { 2119 errln(UnicodeString("ERROR : ") + message); 2120 } 2121 } 2122 2123 UnicodeString 2124 UnicodeSetTest::escape(const UnicodeString& s) { 2125 UnicodeString buf; 2126 for (int32_t i=0; i<s.length(); ) 2127 { 2128 UChar32 c = s.char32At(i); 2129 if (0x0020 <= c && c <= 0x007F) { 2130 buf += c; 2131 } else { 2132 if (c <= 0xFFFF) { 2133 buf += (UChar)0x5c; buf += (UChar)0x75; 2134 } else { 2135 buf += (UChar)0x5c; buf += (UChar)0x55; 2136 buf += toHexString((c & 0xF0000000) >> 28); 2137 buf += toHexString((c & 0x0F000000) >> 24); 2138 buf += toHexString((c & 0x00F00000) >> 20); 2139 buf += toHexString((c & 0x000F0000) >> 16); 2140 } 2141 buf += toHexString((c & 0xF000) >> 12); 2142 buf += toHexString((c & 0x0F00) >> 8); 2143 buf += toHexString((c & 0x00F0) >> 4); 2144 buf += toHexString(c & 0x000F); 2145 } 2146 i += U16_LENGTH(c); 2147 } 2148 return buf; 2149 } 2150 2151 void UnicodeSetTest::TestFreezable() { 2152 UErrorCode errorCode=U_ZERO_ERROR; 2153 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15); 2154 UnicodeSet idSet(idPattern, errorCode); 2155 if(U_FAILURE(errorCode)) { 2156 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode)); 2157 return; 2158 } 2159 2160 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15); 2161 UnicodeSet wsSet(wsPattern, errorCode); 2162 if(U_FAILURE(errorCode)) { 2163 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode)); 2164 return; 2165 } 2166 2167 idSet.add(idPattern); 2168 
UnicodeSet frozen(idSet); 2169 frozen.freeze(); 2170 2171 if(idSet.isFrozen() || !frozen.isFrozen()) { 2172 errln("FAIL: isFrozen() is wrong"); 2173 } 2174 if(frozen!=idSet || !(frozen==idSet)) { 2175 errln("FAIL: a copy-constructed frozen set differs from its original"); 2176 } 2177 2178 frozen=wsSet; 2179 if(frozen!=idSet || !(frozen==idSet)) { 2180 errln("FAIL: a frozen set was modified by operator="); 2181 } 2182 2183 UnicodeSet frozen2(frozen); 2184 if(frozen2!=frozen || frozen2!=idSet) { 2185 errln("FAIL: a copied frozen set differs from its frozen original"); 2186 } 2187 if(!frozen2.isFrozen()) { 2188 errln("FAIL: copy-constructing a frozen set results in a thawed one"); 2189 } 2190 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction. 2191 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) { 2192 errln("FAIL: UnicodeSet(5, 55) failed"); 2193 } 2194 frozen3=frozen; 2195 if(!frozen3.isFrozen()) { 2196 errln("FAIL: copying a frozen set results in a thawed one"); 2197 } 2198 2199 UnicodeSet *cloned=(UnicodeSet *)frozen.clone(); 2200 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) { 2201 errln("FAIL: clone() failed"); 2202 } 2203 cloned->add(0xd802, 0xd805); 2204 if(cloned->containsSome(0xd802, 0xd805)) { 2205 errln("FAIL: unable to modify clone"); 2206 } 2207 delete cloned; 2208 2209 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed(); 2210 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) { 2211 errln("FAIL: cloneAsThawed() failed"); 2212 } 2213 thawed->add(0xd802, 0xd805); 2214 if(!thawed->contains(0xd802, 0xd805)) { 2215 errln("FAIL: unable to modify thawed clone"); 2216 } 2217 delete thawed; 2218 2219 frozen.set(5, 55); 2220 if(frozen!=idSet || !(frozen==idSet)) { 2221 errln("FAIL: UnicodeSet::set() modified a frozen set"); 2222 } 2223 2224 frozen.clear(); 2225 if(frozen!=idSet || !(frozen==idSet)) { 
2226 errln("FAIL: UnicodeSet::clear() modified a frozen set"); 2227 } 2228 2229 frozen.closeOver(USET_CASE_INSENSITIVE); 2230 if(frozen!=idSet || !(frozen==idSet)) { 2231 errln("FAIL: UnicodeSet::closeOver() modified a frozen set"); 2232 } 2233 2234 frozen.compact(); 2235 if(frozen!=idSet || !(frozen==idSet)) { 2236 errln("FAIL: UnicodeSet::compact() modified a frozen set"); 2237 } 2238 2239 ParsePosition pos; 2240 frozen. 2241 applyPattern(wsPattern, errorCode). 2242 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode). 2243 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode). 2244 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode). 2245 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode); 2246 if(frozen!=idSet || !(frozen==idSet)) { 2247 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set"); 2248 } 2249 2250 frozen. 2251 add(0xd800). 2252 add(0xd802, 0xd805). 2253 add(wsPattern). 2254 addAll(idPattern). 2255 addAll(wsSet); 2256 if(frozen!=idSet || !(frozen==idSet)) { 2257 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set"); 2258 } 2259 2260 frozen. 2261 retain(0x62). 2262 retain(0x64, 0x69). 2263 retainAll(wsPattern). 2264 retainAll(wsSet); 2265 if(frozen!=idSet || !(frozen==idSet)) { 2266 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set"); 2267 } 2268 2269 frozen. 2270 remove(0x62). 2271 remove(0x64, 0x69). 2272 remove(idPattern). 2273 removeAll(idPattern). 2274 removeAll(idSet); 2275 if(frozen!=idSet || !(frozen==idSet)) { 2276 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set"); 2277 } 2278 2279 frozen. 2280 complement(). 2281 complement(0x62). 2282 complement(0x64, 0x69). 2283 complement(idPattern). 2284 complementAll(idPattern). 2285 complementAll(idSet); 2286 if(frozen!=idSet || !(frozen==idSet)) { 2287 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set"); 2288 } 2289 } 2290 2291 // Test span() etc. 
-------------------------------------------------------- *** 2292 2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 2294 static int32_t 2295 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) { 2296 UErrorCode errorCode=U_ZERO_ERROR; 2297 int32_t length8=0; 2298 u_strToUTF8(t, capacity, &length8, s, length, &errorCode); 2299 if(U_SUCCESS(errorCode)) { 2300 return length8; 2301 } else { 2302 // The string contains an unpaired surrogate. 2303 // Ignore this string. 2304 return 0; 2305 } 2306 } 2307 2308 class UnicodeSetWithStringsIterator; 2309 2310 // Make the strings in a UnicodeSet easily accessible. 2311 class UnicodeSetWithStrings { 2312 public: 2313 UnicodeSetWithStrings(const UnicodeSet &normalSet) : 2314 set(normalSet), stringsLength(0), hasSurrogates(FALSE) { 2315 int32_t size=set.size(); 2316 if(size>0 && set.charAt(size-1)<0) { 2317 // If a set's last element is not a code point, then it must contain strings. 2318 // Iterate over the set, skip all code point ranges, and cache the strings. 2319 // Convert them to UTF-8 for spanUTF8(). 2320 UnicodeSetIterator iter(set); 2321 const UnicodeString *s; 2322 char *s8=utf8; 2323 int32_t length8, utf8Count=0; 2324 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) { 2325 if(iter.isString()) { 2326 // Store the pointer to the set's string element 2327 // which we happen to know is a stable pointer. 2328 strings[stringsLength]=s=&iter.getString(); 2329 utf8Count+= 2330 utf8Lengths[stringsLength]=length8= 2331 appendUTF8(s->getBuffer(), s->length(), 2332 s8, (int32_t)(sizeof(utf8)-utf8Count)); 2333 if(length8==0) { 2334 hasSurrogates=TRUE; // Contains unpaired surrogates. 
2335 } 2336 s8+=length8; 2337 ++stringsLength; 2338 } 2339 } 2340 } 2341 } 2342 2343 const UnicodeSet &getSet() const { 2344 return set; 2345 } 2346 2347 UBool hasStrings() const { 2348 return (UBool)(stringsLength>0); 2349 } 2350 2351 UBool hasStringsWithSurrogates() const { 2352 return hasSurrogates; 2353 } 2354 2355 private: 2356 friend class UnicodeSetWithStringsIterator; 2357 2358 const UnicodeSet &set; 2359 2360 const UnicodeString *strings[20]; 2361 int32_t stringsLength; 2362 UBool hasSurrogates; 2363 2364 char utf8[1024]; 2365 int32_t utf8Lengths[20]; 2366 2367 int32_t nextStringIndex; 2368 int32_t nextUTF8Start; 2369 }; 2370 2371 class UnicodeSetWithStringsIterator { 2372 public: 2373 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) : 2374 fSet(set), nextStringIndex(0), nextUTF8Start(0) { 2375 } 2376 2377 void reset() { 2378 nextStringIndex=nextUTF8Start=0; 2379 } 2380 2381 const UnicodeString *nextString() { 2382 if(nextStringIndex<fSet.stringsLength) { 2383 return fSet.strings[nextStringIndex++]; 2384 } else { 2385 return NULL; 2386 } 2387 } 2388 2389 // Do not mix with calls to nextString(). 2390 const char *nextUTF8(int32_t &length) { 2391 if(nextStringIndex<fSet.stringsLength) { 2392 const char *s8=fSet.utf8+nextUTF8Start; 2393 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++]; 2394 return s8; 2395 } else { 2396 length=0; 2397 return NULL; 2398 } 2399 } 2400 2401 private: 2402 const UnicodeSetWithStrings &fSet; 2403 int32_t nextStringIndex; 2404 int32_t nextUTF8Start; 2405 }; 2406 2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 2408 // at code point boundaries. 2409 // That is, each edge of a match must not be in the middle of a surrogate pair. 
// Returns TRUE if t occurs in s at index start and neither end of that
// occurrence splits a surrogate pair. limit is the end index of the text in s.
// The caller must ensure that t is no longer than limit-start.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    s+=start;       // s now points at the candidate match
    limit-=start;   // limit is now relative to the candidate match
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Returns the limit of the span that starts at s[0], for text s[0..length).
// Three cases: the set has no strings (plain code point loop),
// NOT_CONTAINED with strings (stop at the first contained code point or
// string match), and CONTAINED/SIMPLE with strings (follow string matches;
// CONTAINED recurses to try every match, SIMPLE takes the longest match).
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the mismatching code point is not spanned.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: best limit found so far through recursion (CONTAINED only).
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16(): spans backward from s[length]
// and returns the start index of the span. Same three cases as the forward
// version; "length" is reused as the moving position, so the original text
// length is kept in length0 where it is still needed.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: best start found so far through recursion (CONTAINED only).
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16(). Differences: code points are read
// with U8_NEXT() and an illegal sequence (c<0) is treated as U+FFFD; strings
// are matched by memcmp() against their cached UTF-8 forms, and strings with
// UTF-8 length 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// UTF-8 counterpart of containsSpanBackUTF16(): spans backward from s[length]
// and returns the start index of the span. Illegal sequences are treated as
// U+FFFD and strings with UTF-8 length 0 are skipped.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Bit flags; each group is followed by the mask that combines its members.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggles between USET_SPAN_NOT_CONTAINED and the given "contained" flavor
// (USET_SPAN_CONTAINED or USET_SPAN_SIMPLE, as passed by the caller).
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Length of a NUL-terminated string, in UChars if isUTF16, otherwise in chars.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
2873 */ 2874 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement, 2875 const void *s, int32_t length, UBool isUTF16, 2876 uint32_t whichSpans, 2877 int type, const char *&typeName, 2878 int32_t limits[], int32_t limitsCapacity, 2879 int32_t expectCount) { 2880 const UnicodeSet &realSet(set.getSet()); 2881 int32_t start, count; 2882 USetSpanCondition spanCondition, firstSpanCondition, contained; 2883 UBool isForward; 2884 2885 if(type<0 || 7<type) { 2886 typeName=""; 2887 return 0; 2888 } 2889 2890 static const char *const typeNames16[]={ 2891 "contains", "contains(LM)", 2892 "span", "span(LM)", 2893 "containsBack", "containsBack(LM)", 2894 "spanBack", "spanBack(LM)" 2895 }; 2896 2897 static const char *const typeNames8[]={ 2898 "containsUTF8", "containsUTF8(LM)", 2899 "spanUTF8", "spanUTF8(LM)", 2900 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented 2901 "spanBackUTF8", "spanBackUTF8(LM)" 2902 }; 2903 2904 typeName= isUTF16 ? typeNames16[type] : typeNames8[type]; 2905 2906 // filter span options 2907 if(type<=3) { 2908 // span forward 2909 if((whichSpans&SPAN_FWD)==0) { 2910 return -1; 2911 } 2912 isForward=TRUE; 2913 } else { 2914 // span backward 2915 if((whichSpans&SPAN_BACK)==0) { 2916 return -1; 2917 } 2918 isForward=FALSE; 2919 } 2920 if((type&1)==0) { 2921 // use USET_SPAN_CONTAINED 2922 if((whichSpans&SPAN_CONTAINED)==0) { 2923 return -1; 2924 } 2925 contained=USET_SPAN_CONTAINED; 2926 } else { 2927 // use USET_SPAN_SIMPLE 2928 if((whichSpans&SPAN_SIMPLE)==0) { 2929 return -1; 2930 } 2931 contained=USET_SPAN_SIMPLE; 2932 } 2933 2934 // Default first span condition for going forward with an uncomplemented set. 2935 spanCondition=USET_SPAN_NOT_CONTAINED; 2936 if(isComplement) { 2937 spanCondition=invertSpanCondition(spanCondition, contained); 2938 } 2939 2940 // First span condition for span(), used to terminate the spanBack() iteration. 
2941 firstSpanCondition=spanCondition; 2942 2943 // spanBack(): Its initial span condition is span()'s last span condition, 2944 // which is the opposite of span()'s first span condition 2945 // if we expect an even number of spans. 2946 // (The loop inverts spanCondition (expectCount-1) times 2947 // before the expectCount'th span() call.) 2948 // If we do not compare forward and backward directions, then we do not have an 2949 // expectCount and just start with firstSpanCondition. 2950 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) { 2951 spanCondition=invertSpanCondition(spanCondition, contained); 2952 } 2953 2954 count=0; 2955 switch(type) { 2956 case 0: 2957 case 1: 2958 start=0; 2959 if(length<0) { 2960 length=slen(s, isUTF16); 2961 } 2962 for(;;) { 2963 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) : 2964 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition); 2965 if(count<limitsCapacity) { 2966 limits[count]=start; 2967 } 2968 ++count; 2969 if(start>=length) { 2970 break; 2971 } 2972 spanCondition=invertSpanCondition(spanCondition, contained); 2973 } 2974 break; 2975 case 2: 2976 case 3: 2977 start=0; 2978 for(;;) { 2979 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) : 2980 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition); 2981 if(count<limitsCapacity) { 2982 limits[count]=start; 2983 } 2984 ++count; 2985 if(length>=0 ? start>=length : 2986 isUTF16 ? ((const UChar *)s)[start]==0 : 2987 ((const char *)s)[start]==0 2988 ) { 2989 break; 2990 } 2991 spanCondition=invertSpanCondition(spanCondition, contained); 2992 } 2993 break; 2994 case 4: 2995 case 5: 2996 if(length<0) { 2997 length=slen(s, isUTF16); 2998 } 2999 for(;;) { 3000 ++count; 3001 if(count<=limitsCapacity) { 3002 limits[limitsCapacity-count]=length; 3003 } 3004 length= isUTF16 ? 
containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) : 3005 containsSpanBackUTF8(set, (const char *)s, length, spanCondition); 3006 if(length==0 && spanCondition==firstSpanCondition) { 3007 break; 3008 } 3009 spanCondition=invertSpanCondition(spanCondition, contained); 3010 } 3011 if(count<limitsCapacity) { 3012 memmove(limits, limits+(limitsCapacity-count), count*4); 3013 } 3014 break; 3015 case 6: 3016 case 7: 3017 for(;;) { 3018 ++count; 3019 if(count<=limitsCapacity) { 3020 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16); 3021 } 3022 // Note: Length<0 is tested only for the first spanBack(). 3023 // If we wanted to keep length<0 for all spanBack()s, we would have to 3024 // temporarily modify the string by placing a NUL where the previous spanBack() stopped. 3025 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) : 3026 realSet.spanBackUTF8((const char *)s, length, spanCondition); 3027 if(length==0 && spanCondition==firstSpanCondition) { 3028 break; 3029 } 3030 spanCondition=invertSpanCondition(spanCondition, contained); 3031 } 3032 if(count<limitsCapacity) { 3033 memmove(limits, limits+(limitsCapacity-count), count*4); 3034 } 3035 break; 3036 default: 3037 typeName=""; 3038 return -1; 3039 } 3040 3041 return count; 3042 } 3043 3044 // sets to be tested; odd index=isComplement 3045 enum { 3046 SLOW, 3047 SLOW_NOT, 3048 FAST, 3049 FAST_NOT, 3050 SET_COUNT 3051 }; 3052 3053 static const char *const setNames[SET_COUNT]={ 3054 "slow", 3055 "slow.not", 3056 "fast", 3057 "fast.not" 3058 }; 3059 3060 /* 3061 * Verify that we get the same results whether we look at text with contains(), 3062 * span() or spanBack(), using unfrozen or frozen versions of the set, 3063 * and using the set or its complement (switching the spanConditions accordingly). 3064 * The latter verifies that 3065 * set.span(spanCondition) == set.complement().span(!spanCondition). 
3066 * 3067 * The expectLimits[] are either provided by the caller (with expectCount>=0) 3068 * or returned to the caller (with an input expectCount<0). 3069 */ 3070 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3071 const void *s, int32_t length, UBool isUTF16, 3072 uint32_t whichSpans, 3073 int32_t expectLimits[], int32_t &expectCount, 3074 const char *testName, int32_t index) { 3075 int32_t limits[500]; 3076 int32_t limitsCount; 3077 int i, j; 3078 3079 const char *typeName; 3080 int type; 3081 3082 for(i=0; i<SET_COUNT; ++i) { 3083 if((i&1)==0) { 3084 // Even-numbered sets are original, uncomplemented sets. 3085 if((whichSpans&SPAN_SET)==0) { 3086 continue; 3087 } 3088 } else { 3089 // Odd-numbered sets are complemented. 3090 if((whichSpans&SPAN_COMPLEMENT)==0) { 3091 continue; 3092 } 3093 } 3094 for(type=0;; ++type) { 3095 limitsCount=getSpans(*sets[i], (UBool)(i&1), 3096 s, length, isUTF16, 3097 whichSpans, 3098 type, typeName, 3099 limits, LENGTHOF(limits), expectCount); 3100 if(typeName[0]==0) { 3101 break; // All types tried. 3102 } 3103 if(limitsCount<0) { 3104 continue; // Span option filtered out. 
3105 } 3106 if(expectCount<0) { 3107 expectCount=limitsCount; 3108 if(limitsCount>LENGTHOF(limits)) { 3109 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans", 3110 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits)); 3111 return; 3112 } 3113 memcpy(expectLimits, limits, limitsCount*4); 3114 } else if(limitsCount!=expectCount) { 3115 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld", 3116 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount); 3117 } else { 3118 for(j=0; j<limitsCount; ++j) { 3119 if(limits[j]!=expectLimits[j]) { 3120 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld", 3121 testName, (long)index, setNames[i], typeName, (long)limitsCount, 3122 j, (long)limits[j], (long)expectLimits[j]); 3123 break; 3124 } 3125 } 3126 } 3127 } 3128 } 3129 3130 // Compare span() with containsAll()/containsNone(), 3131 // but only if we have expectLimits[] from the uncomplemented set. 
3132 if(isUTF16 && (whichSpans&SPAN_SET)!=0) { 3133 const UChar *s16=(const UChar *)s; 3134 UnicodeString string; 3135 int32_t prev=0, limit, length; 3136 for(i=0; i<expectCount; ++i) { 3137 limit=expectLimits[i]; 3138 length=limit-prev; 3139 if(length>0) { 3140 string.setTo(FALSE, s16+prev, length); // read-only alias 3141 if(i&1) { 3142 if(!sets[SLOW]->getSet().containsAll(string)) { 3143 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3144 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3145 return; 3146 } 3147 if(!sets[FAST]->getSet().containsAll(string)) { 3148 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3149 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3150 return; 3151 } 3152 } else { 3153 if(!sets[SLOW]->getSet().containsNone(string)) { 3154 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3155 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3156 return; 3157 } 3158 if(!sets[FAST]->getSet().containsNone(string)) { 3159 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3160 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3161 return; 3162 } 3163 } 3164 } 3165 prev=limit; 3166 } 3167 } 3168 } 3169 3170 // Specifically test either UTF-16 or UTF-8. 
3171 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3172 const void *s, int32_t length, UBool isUTF16, 3173 uint32_t whichSpans, 3174 const char *testName, int32_t index) { 3175 int32_t expectLimits[500]; 3176 int32_t expectCount=-1; 3177 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3178 } 3179 3180 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3181 UChar c, c2; 3182 3183 if(length>=0) { 3184 while(length>0) { 3185 c=*s++; 3186 --length; 3187 if(0xd800<=c && c<0xe000) { 3188 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3189 return TRUE; 3190 } 3191 --length; 3192 } 3193 } 3194 } else { 3195 while((c=*s++)!=0) { 3196 if(0xd800<=c && c<0xe000) { 3197 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3198 return TRUE; 3199 } 3200 } 3201 } 3202 } 3203 return FALSE; 3204 } 3205 3206 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3207 // unless either UTF is turned off in whichSpans. 3208 // Testing UTF-16 and UTF-8 together requires that surrogate code points 3209 // have the same contains(c) value as U+FFFD. 3210 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3211 const UChar *s16, int32_t length16, 3212 uint32_t whichSpans, 3213 const char *testName, int32_t index) { 3214 int32_t expectLimits[500]; 3215 int32_t expectCount; 3216 3217 expectCount=-1; // Get expectLimits[] from testSpan(). 3218 3219 if((whichSpans&SPAN_UTF16)!=0) { 3220 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3221 } 3222 if((whichSpans&SPAN_UTF8)==0) { 3223 return; 3224 } 3225 3226 // Convert s16[] and expectLimits[] to UTF-8. 
3227 uint8_t s8[3000]; 3228 int32_t offsets[3000]; 3229 3230 const UChar *s16Limit=s16+length16; 3231 char *t=(char *)s8; 3232 char *tLimit=t+sizeof(s8); 3233 int32_t *o=offsets; 3234 UErrorCode errorCode=U_ZERO_ERROR; 3235 3236 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3237 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3238 if(U_FAILURE(errorCode)) { 3239 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3240 testName, (long)index, u_errorName(errorCode)); 3241 ucnv_resetFromUnicode(utf8Cnv); 3242 return; 3243 } 3244 int32_t length8=(int32_t)(t-(char *)s8); 3245 3246 // Convert expectLimits[]. 3247 int32_t i, j, expect; 3248 for(i=j=0; i<expectCount; ++i) { 3249 expect=expectLimits[i]; 3250 if(expect==length16) { 3251 expectLimits[i]=length8; 3252 } else { 3253 while(offsets[j]<expect) { 3254 ++j; 3255 } 3256 expectLimits[i]=j; 3257 } 3258 } 3259 3260 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3261 } 3262 3263 static UChar32 nextCodePoint(UChar32 c) { 3264 // Skip some large and boring ranges. 3265 switch(c) { 3266 case 0x3441: 3267 return 0x4d7f; 3268 case 0x5100: 3269 return 0x9f00; 3270 case 0xb040: 3271 return 0xd780; 3272 case 0xe041: 3273 return 0xf8fe; 3274 case 0x10100: 3275 return 0x20000; 3276 case 0x20041: 3277 return 0xe0000; 3278 case 0xe0101: 3279 return 0x10fffd; 3280 default: 3281 return c+1; 3282 } 3283 } 3284 3285 // Verify that all implementations represent the same set. 3286 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3287 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3288 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3289 // Skip the UTF-8 part of the test - if the string contains surrogates - 3290 // because it is likely to produce a different result. 
3291 UBool inconsistentSurrogates= 3292 (!(sets[0]->getSet().contains(0xfffd) ? 3293 sets[0]->getSet().contains(0xd800, 0xdfff) : 3294 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3295 sets[0]->hasStringsWithSurrogates()); 3296 3297 UChar s[1000]; 3298 int32_t length=0; 3299 uint32_t localWhichSpans; 3300 3301 UChar32 c, first; 3302 for(first=c=0;; c=nextCodePoint(c)) { 3303 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) { 3304 localWhichSpans=whichSpans; 3305 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3306 localWhichSpans&=~SPAN_UTF8; 3307 } 3308 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3309 if(c>0x10ffff) { 3310 break; 3311 } 3312 length=0; 3313 first=c; 3314 } 3315 U16_APPEND_UNSAFE(s, length, c); 3316 } 3317 } 3318 3319 // Test with a particular, interesting string. 3320 // Specify length and try NUL-termination. 3321 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3322 static const UChar s[]={ 3323 0x61, 0x62, 0x20, // Latin, space 3324 0x3b1, 0x3b2, 0x3b3, // Greek 3325 0xd900, // lead surrogate 3326 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3327 0xdc05, // trail surrogate 3328 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3329 0xd900, 0xdc05, // unassigned supplementary 3330 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3331 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3332 0 // NUL 3333 }; 3334 3335 if((whichSpans&SPAN_UTF16)==0) { 3336 return; 3337 } 3338 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3339 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3340 } 3341 3342 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3343 static const char s[]={ 3344 "abc" // Latin 3345 3346 /* trail byte in lead position */ 3347 "\x80" 3348 3349 " " // space 3350 
3351 /* truncated multi-byte sequences */ 3352 "\xd0" 3353 "\xe0" 3354 "\xe1" 3355 "\xed" 3356 "\xee" 3357 "\xf0" 3358 "\xf1" 3359 "\xf4" 3360 "\xf8" 3361 "\xfc" 3362 3363 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3364 3365 /* trail byte in lead position */ 3366 "\x80" 3367 3368 "\xe0\x80" 3369 "\xe0\xa0" 3370 "\xe1\x80" 3371 "\xed\x80" 3372 "\xed\xa0" 3373 "\xee\x80" 3374 "\xf0\x80" 3375 "\xf0\x90" 3376 "\xf1\x80" 3377 "\xf4\x80" 3378 "\xf4\x90" 3379 "\xf8\x80" 3380 "\xfc\x80" 3381 3382 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3383 3384 /* trail byte in lead position */ 3385 "\x80" 3386 3387 "\xf0\x80\x80" 3388 "\xf0\x90\x80" 3389 "\xf1\x80\x80" 3390 "\xf4\x80\x80" 3391 "\xf4\x90\x80" 3392 "\xf8\x80\x80" 3393 "\xfc\x80\x80" 3394 3395 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3396 3397 /* trail byte in lead position */ 3398 "\x80" 3399 3400 "\xf8\x80\x80\x80" 3401 "\xfc\x80\x80\x80" 3402 3403 "\xF1\x90\x80\x85" // unassigned supplementary 3404 3405 /* trail byte in lead position */ 3406 "\x80" 3407 3408 "\xfc\x80\x80\x80\x80" 3409 3410 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3411 3412 /* trail byte in lead position */ 3413 "\x80" 3414 3415 /* complete sequences but non-shortest forms or out of range etc. */ 3416 "\xc0\x80" 3417 "\xe0\x80\x80" 3418 "\xed\xa0\x80" 3419 "\xf0\x80\x80\x80" 3420 "\xf4\x90\x80\x80" 3421 "\xf8\x80\x80\x80\x80" 3422 "\xfc\x80\x80\x80\x80\x80" 3423 "\xfe" 3424 "\xff" 3425 3426 /* trail byte in lead position */ 3427 "\x80" 3428 3429 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3430 }; 3431 3432 if((whichSpans&SPAN_UTF8)==0) { 3433 return; 3434 } 3435 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3436 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3437 } 3438 3439 // Take a set of span options and multiply them so that 3440 // each portion only has one of the options a, b and c. 
3441 // If b==0, then the set of options is just modified with mask and a. 3442 // If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3443 static int32_t 3444 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3445 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3446 uint32_t s; 3447 int32_t i; 3448 3449 for(i=0; i<whichSpansCount; ++i) { 3450 s=whichSpans[i]&mask; 3451 whichSpans[i]=s|a; 3452 if(b!=0) { 3453 whichSpans[whichSpansCount+i]=s|b; 3454 if(c!=0) { 3455 whichSpans[2*whichSpansCount+i]=s|c; 3456 } 3457 } 3458 } 3459 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3460 } 3461 3462 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3463 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3464 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3465 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3466 3467 void UnicodeSetTest::TestSpan() { 3468 // "[...]" is a UnicodeSet pattern. 3469 // "*" performs tests on all Unicode code points and on a selection of 3470 // malformed UTF-8/16 strings. 3471 // "-options" limits the scope of testing for the current set. 3472 // By default, the test verifies that equivalent boundaries are found 3473 // for UTF-16 and UTF-8, going forward and backward, 3474 // alternating USET_SPAN_NOT_CONTAINED with 3475 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3476 // Single-character options: 3477 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3478 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3479 // or the set contains strings with unpaired surrogates 3480 // which do not translate to valid UTF-8. 3481 // c -- set.span() and set.complement().span() boundaries may differ. 3482 // Cause: Set strings are not complemented. 3483 // b -- span() and spanBack() boundaries may differ. 
3484 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3485 // and spanBack(USET_SPAN_SIMPLE) are defined to 3486 // match with non-overlapping substrings. 3487 // For example, with a set containing "ab" and "ba", 3488 // span() of "aba" yields boundaries { 0, 2, 3 } 3489 // because the initial "ab" matches from 0 to 2, 3490 // while spanBack() yields boundaries { 0, 1, 3 } 3491 // because the final "ba" matches from 1 to 3. 3492 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3493 // Cause: Strings in the set overlap, and a longer match may 3494 // require a sequence including non-longest substrings. 3495 // For example, with a set containing "ab", "abc" and "cd", 3496 // span(contained) of "abcd" spans the entire string 3497 // but span(longest match) only spans the first 3 characters. 3498 // Each "-options" first resets all options and then applies the specified options. 3499 // A "-" without options resets the options. 3500 // The options are also reset for each new set. 3501 // Other strings will be spanned. 3502 static const char *const testdata[]={ 3503 "[:ID_Continue:]", 3504 "*", 3505 "[:White_Space:]", 3506 "*", 3507 "[]", 3508 "*", 3509 "[\\u0000-\\U0010FFFF]", 3510 "*", 3511 "[\\u0000\\u0080\\u0800\\U00010000]", 3512 "*", 3513 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3514 "*", 3515 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3516 "-c", 3517 "*", 3518 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3519 "-c", 3520 "*", 3521 3522 // Overlapping strings cause overlapping attempts to match. 3523 "[x{xy}{xya}{axy}{ax}]", 3524 "-cl", 3525 3526 // More repetitions of "xya" would take too long with the recursive 3527 // reference implementation. 3528 // containsAll()=FALSE 3529 // test_string 0x14 3530 "xx" 3531 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3532 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3533 "xyaxyaxyaxya" 3534 "xx" 3535 "xyaxyaxyaxya" // span() ends here. 3536 "aaa", 3537 3538 // containsAll()=TRUE 3539 // test_string 0x15 3540 "xx" 3541 "xyaxyaxyaxya" 3542 "xx" 3543 "xyaxyaxyaxya" 3544 "xx" 3545 "xyaxyaxyaxy", 3546 3547 "-bc", 3548 // test_string 0x17 3549 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3550 "-c", 3551 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3552 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3553 "-", 3554 "byaya", // span() -> { 5 } 3555 "byay", // span() -> { 4 } 3556 "bya", // span() -> { 3 } 3557 3558 // span(longest match) will not span the whole string. 3559 "[a{ab}{bc}]", 3560 "-cl", 3561 // test_string 0x21 3562 "abc", 3563 3564 "[a{ab}{abc}{cd}]", 3565 "-cl", 3566 "acdabcdabccd", 3567 3568 // spanBack(longest match) will not span the whole string. 3569 "[c{ab}{bc}]", 3570 "-cl", 3571 "abc", 3572 3573 "[d{cd}{bcd}{ab}]", 3574 "-cl", 3575 "abbcdabcdabd", 3576 3577 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3578 // and UTF-8 trail bytes. 3579 // Copies of above test sets and strings, but transliterated to have 3580 // different code points with similar trail units. 3581 // Previous: a b c d 3582 // Unicode: 042B 30AB 200AB 204AB 3583 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3584 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3585 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3586 "-cl", 3587 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3588 3589 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3590 "-cl", 3591 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3592 3593 // Stress bookkeeping and recursion. 
3594 // The following strings are barely doable with the recursive 3595 // reference implementation. 3596 // The not-contained character at the end prevents an early exit from the span(). 3597 "[b{bb}]", 3598 "-c", 3599 // test_string 0x33 3600 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3601 // On complement sets, span() and spanBack() get different results 3602 // because b is not in the complement set and there is an odd number of b's 3603 // in the test string. 3604 "-bc", 3605 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3606 3607 // Test with set strings with an initial or final code point span 3608 // longer than 254. 3609 "[a{" _64_a _64_a _64_a _64_a "b}" 3610 "{a" _64_b _64_b _64_b _64_b "}]", 3611 "-c", 3612 _64_a _64_a _64_a _63_a "b", 3613 _64_a _64_a _64_a _64_a "b", 3614 _64_a _64_a _64_a _64_a "aaaabbbb", 3615 "a" _64_b _64_b _64_b _63_b, 3616 "a" _64_b _64_b _64_b _64_b, 3617 "aaaabbbb" _64_b _64_b _64_b _64_b, 3618 3619 // Test with strings containing unpaired surrogates. 3620 // They are not representable in UTF-8, and a leading trail surrogate 3621 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3622 // U+20001 == \\uD840\\uDC01 3623 // U+20400 == \\uD841\\uDC00 3624 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3625 "-8cl", 3626 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3627 }; 3628 uint32_t whichSpans[96]={ SPAN_ALL }; 3629 int32_t whichSpansCount=1; 3630 3631 UnicodeSet *sets[SET_COUNT]={ NULL }; 3632 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3633 3634 char testName[1024]; 3635 char *testNameLimit=testName; 3636 3637 int32_t i, j; 3638 for(i=0; i<LENGTHOF(testdata); ++i) { 3639 const char *s=testdata[i]; 3640 if(s[0]=='[') { 3641 // Create new test sets from this pattern. 
3642 for(j=0; j<SET_COUNT; ++j) { 3643 delete sets_with_str[j]; 3644 delete sets[j]; 3645 } 3646 UErrorCode errorCode=U_ZERO_ERROR; 3647 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3648 if(U_FAILURE(errorCode)) { 3649 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3650 break; 3651 } 3652 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3653 sets[SLOW_NOT]->complement(); 3654 // Intermediate set: Test cloning of a frozen set. 3655 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3656 fast->freeze(); 3657 sets[FAST]=(UnicodeSet *)fast->clone(); 3658 delete fast; 3659 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3660 fastNot->freeze(); 3661 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3662 delete fastNot; 3663 3664 for(j=0; j<SET_COUNT; ++j) { 3665 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3666 } 3667 3668 strcpy(testName, s); 3669 testNameLimit=strchr(testName, 0); 3670 *testNameLimit++=':'; 3671 *testNameLimit=0; 3672 3673 whichSpans[0]=SPAN_ALL; 3674 whichSpansCount=1; 3675 } else if(s[0]=='-') { 3676 whichSpans[0]=SPAN_ALL; 3677 whichSpansCount=1; 3678 3679 while(*++s!=0) { 3680 switch(*s) { 3681 case 'c': 3682 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3683 ~SPAN_POLARITY, 3684 SPAN_SET, 3685 SPAN_COMPLEMENT, 3686 0); 3687 break; 3688 case 'b': 3689 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3690 ~SPAN_DIRS, 3691 SPAN_FWD, 3692 SPAN_BACK, 3693 0); 3694 break; 3695 case 'l': 3696 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3697 // USET_SPAN_SIMPLE only FWD, and separately 3698 // USET_SPAN_SIMPLE only BACK 3699 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3700 ~(SPAN_DIRS|SPAN_CONDITION), 3701 SPAN_DIRS|SPAN_CONTAINED, 3702 SPAN_FWD|SPAN_SIMPLE, 3703 SPAN_BACK|SPAN_SIMPLE); 3704 break; 3705 case '8': 3706 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3707 ~SPAN_UTFS, 3708 SPAN_UTF16, 3709 SPAN_UTF8, 3710 
0); 3711 break; 3712 default: 3713 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3714 break; 3715 } 3716 } 3717 } else if(0==strcmp(s, "*")) { 3718 strcpy(testNameLimit, "bad_string"); 3719 for(j=0; j<whichSpansCount; ++j) { 3720 if(whichSpansCount>1) { 3721 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3722 "%%0x%3x", 3723 whichSpans[j]); 3724 } 3725 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3726 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3727 } 3728 3729 strcpy(testNameLimit, "contents"); 3730 for(j=0; j<whichSpansCount; ++j) { 3731 if(whichSpansCount>1) { 3732 sprintf(testNameLimit+8 /* strlen("contents") */, 3733 "%%0x%3x", 3734 whichSpans[j]); 3735 } 3736 testSpanContents(sets_with_str, whichSpans[j], testName); 3737 } 3738 } else { 3739 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3740 strcpy(testNameLimit, "test_string"); 3741 for(j=0; j<whichSpansCount; ++j) { 3742 if(whichSpansCount>1) { 3743 sprintf(testNameLimit+11 /* strlen("test_string") */, 3744 "%%0x%3x", 3745 whichSpans[j]); 3746 } 3747 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3748 } 3749 } 3750 } 3751 for(j=0; j<SET_COUNT; ++j) { 3752 delete sets_with_str[j]; 3753 delete sets[j]; 3754 } 3755 } 3756 3757 // Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3758 void UnicodeSetTest::TestStringSpan() { 3759 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3760 static const char *const string= 3761 "xx" 3762 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3763 "xx" 3764 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3765 "xx" 3766 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3767 "aaaa"; 3768 3769 UErrorCode errorCode=U_ZERO_ERROR; 3770 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3771 UnicodeSet set(pattern16, errorCode); 3772 if(U_FAILURE(errorCode)) { 3773 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3774 return; 3775 } 3776 3777 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3778 3779 if(set.containsAll(string16)) { 3780 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3781 } 3782 3783 // Remove trailing "aaaa". 3784 string16.truncate(string16.length()-4); 3785 if(!set.containsAll(string16)) { 3786 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3787 } 3788 3789 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3790 const UChar *s16=string16.getBuffer(); 3791 int32_t length16=string16.length(); 3792 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3793 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3794 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3795 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3796 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3797 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3798 ) { 3799 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3800 } 3801 3802 pattern="[a{ab}{abc}{cd}]"; 3803 pattern16=UnicodeString(pattern, -1, US_INV); 3804 set.applyPattern(pattern16, errorCode); 3805 if(U_FAILURE(errorCode)) { 3806 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3807 return; 3808 } 3809 
string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3810 s16=string16.getBuffer(); 3811 length16=string16.length(); 3812 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3813 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3814 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3815 ) { 3816 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3817 } 3818 3819 pattern="[d{cd}{bcd}{ab}]"; 3820 pattern16=UnicodeString(pattern, -1, US_INV); 3821 set.applyPattern(pattern16, errorCode).freeze(); 3822 if(U_FAILURE(errorCode)) { 3823 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3824 return; 3825 } 3826 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3827 s16=string16.getBuffer(); 3828 length16=string16.length(); 3829 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3830 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3831 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3832 ) { 3833 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3834 } 3835 } 3836