1 /* 2 ******************************************************************************** 3 * Copyright (C) 1999-2015 International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************** 6 * Date Name Description 7 * 10/20/99 alan Creation. 8 * 03/22/2000 Madhu Added additional tests 9 ******************************************************************************** 10 */ 11 12 #include <stdio.h> 13 14 #include <string.h> 15 #include "unicode/utypes.h" 16 #include "usettest.h" 17 #include "unicode/ucnv.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/usetiter.h" 21 #include "unicode/ustring.h" 22 #include "unicode/parsepos.h" 23 #include "unicode/symtable.h" 24 #include "unicode/uversion.h" 25 #include "hash.h" 26 27 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 28 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \ 29 u_errorName(status));}} 30 31 #define TEST_ASSERT(expr) {if (!(expr)) { \ 32 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }} 33 34 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { 35 UnicodeString pat; 36 set.toPattern(pat); 37 return left + UnicodeSetTest::escape(pat); 38 } 39 40 #define CASE(id,test) case id: \ 41 name = #test; \ 42 if (exec) { \ 43 logln(#test "---"); \ 44 logln(); \ 45 test(); \ 46 } \ 47 break 48 49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { 50 } 51 52 UConverter *UnicodeSetTest::openUTF8Converter() { 53 if(utf8Cnv==NULL) { 54 UErrorCode errorCode=U_ZERO_ERROR; 55 utf8Cnv=ucnv_open("UTF-8", &errorCode); 56 } 57 return utf8Cnv; 58 } 59 60 UnicodeSetTest::~UnicodeSetTest() { 61 ucnv_close(utf8Cnv); 62 } 63 64 void 65 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, 66 const char* &name, char* /*par*/) { 67 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); 68 switch (index) { 69 
CASE(0,TestPatterns); 70 CASE(1,TestAddRemove); 71 CASE(2,TestCategories); 72 CASE(3,TestCloneEqualHash); 73 CASE(4,TestMinimalRep); 74 CASE(5,TestAPI); 75 CASE(6,TestScriptSet); 76 CASE(7,TestPropertySet); 77 CASE(8,TestClone); 78 CASE(9,TestExhaustive); 79 CASE(10,TestToPattern); 80 CASE(11,TestIndexOf); 81 CASE(12,TestStrings); 82 CASE(13,Testj2268); 83 CASE(14,TestCloseOver); 84 CASE(15,TestEscapePattern); 85 CASE(16,TestInvalidCodePoint); 86 CASE(17,TestSymbolTable); 87 CASE(18,TestSurrogate); 88 CASE(19,TestPosixClasses); 89 CASE(20,TestIteration); 90 CASE(21,TestFreezable); 91 CASE(22,TestSpan); 92 CASE(23,TestStringSpan); 93 CASE(24,TestUCAUnsafeBackwards); 94 default: name = ""; break; 95 } 96 } 97 98 static const char NOT[] = "%%%%"; 99 100 /** 101 * UVector was improperly copying contents 102 * This code will crash this is still true 103 */ 104 void UnicodeSetTest::Testj2268() { 105 UnicodeSet t; 106 t.add(UnicodeString("abc")); 107 UnicodeSet test(t); 108 UnicodeString ustrPat; 109 test.toPattern(ustrPat, TRUE); 110 } 111 112 /** 113 * Test toPattern(). 114 */ 115 void UnicodeSetTest::TestToPattern() { 116 UErrorCode ec = U_ZERO_ERROR; 117 118 // Test that toPattern() round trips with syntax characters and 119 // whitespace. 120 { 121 static const char* OTHER_TOPATTERN_TESTS[] = { 122 "[[:latin:]&[:greek:]]", 123 "[[:latin:]-[:greek:]]", 124 "[:nonspacing mark:]", 125 NULL 126 }; 127 128 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { 129 ec = U_ZERO_ERROR; 130 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); 131 if (U_FAILURE(ec)) { 132 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec))); 133 continue; 134 } 135 checkPat(OTHER_TOPATTERN_TESTS[j], s); 136 } 137 138 for (UChar32 i = 0; i <= 0x10FFFF; ++i) { 139 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { 140 141 // check various combinations to make sure they all work. 
142 if (i != 0 && !toPatternAux(i, i)){ 143 continue; 144 } 145 if (!toPatternAux(0, i)){ 146 continue; 147 } 148 if (!toPatternAux(i, 0xFFFF)){ 149 continue; 150 } 151 } 152 } 153 } 154 155 // Test pattern behavior of multicharacter strings. 156 { 157 ec = U_ZERO_ERROR; 158 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); 159 160 // This loop isn't a loop. It's here to make the compiler happy. 161 // If you're curious, try removing it and changing the 'break' 162 // statements (except for the last) to goto's. 163 for (;;) { 164 if (U_FAILURE(ec)) break; 165 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; 166 expectToPattern(*s, "[a-z{aa}{ab}]", exp1); 167 168 s->add("ac"); 169 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; 170 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); 171 172 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec); 173 if (U_FAILURE(ec)) break; 174 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; 175 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3); 176 177 s->add("[]"); 178 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; 179 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4); 180 181 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec); 182 if (U_FAILURE(ec)) break; 183 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; 184 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5); 185 186 // j2189 187 s->clear(); 188 s->add(UnicodeString("abc", "")); 189 s->add(UnicodeString("abc", "")); 190 const char* exp6[] = {"abc", NOT, "ab", NULL}; 191 expectToPattern(*s, "[{abc}]", exp6); 192 193 break; 194 } 195 196 if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); 197 delete s; 198 } 199 200 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 201 UnicodeSet s; 202 s.add((UChar)97, (UChar)98); // 'a', 'b' 203 expectToPattern(s, "[ab]", NULL); 204 } 205 206 UBool 
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 207 208 // use Integer.toString because Utility.hex doesn't handle ints 209 UnicodeString pat = ""; 210 // TODO do these in hex 211 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 212 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 213 UnicodeString source; 214 source = source + (uint32_t)start; 215 if (start != end) 216 source = source + ".." + (uint32_t)end; 217 UnicodeSet testSet; 218 testSet.add(start, end); 219 return checkPat(source, testSet); 220 } 221 222 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 223 const UnicodeSet& testSet) { 224 // What we want to make sure of is that a pattern generated 225 // by toPattern(), with or without escaped unprintables, can 226 // be passed back into the UnicodeSet constructor. 227 UnicodeString pat0; 228 229 testSet.toPattern(pat0, TRUE); 230 231 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 232 233 //String pat1 = unescapeLeniently(pat0); 234 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 235 236 UnicodeString pat2; 237 testSet.toPattern(pat2, FALSE); 238 if (!checkPat(source, testSet, pat2)) return FALSE; 239 240 //String pat3 = unescapeLeniently(pat2); 241 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 242 243 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 244 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 245 return TRUE; 246 } 247 248 UBool UnicodeSetTest::checkPat(const UnicodeString& source, 249 const UnicodeSet& testSet, 250 const UnicodeString& pat) { 251 UErrorCode ec = U_ZERO_ERROR; 252 UnicodeSet testSet2(pat, ec); 253 if (testSet2 != testSet) { 254 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 255 return FALSE; 256 } 257 return TRUE; 258 } 259 260 void 261 UnicodeSetTest::TestPatterns(void) { 262 UnicodeSet set; 263 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 264 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 265 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 266 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 268 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 269 270 // Throw in a test of complement 271 set.complement(); 272 UnicodeString exp; 273 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 274 expectPairs(set, exp); 275 } 276 277 void 278 UnicodeSetTest::TestCategories(void) { 279 UErrorCode status = U_ZERO_ERROR; 280 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 281 UnicodeSet set(pat, status); 282 if (U_FAILURE(status)) { 283 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 284 return; 285 } else { 286 expectContainment(set, pat, "ABC", "abc"); 287 } 288 289 UChar32 i; 290 int32_t failures = 0; 291 // Make sure generation of L doesn't pollute cached Lu set 292 // First generate L, then Lu 293 set.applyPattern("[:L:]", status); 294 if (U_FAILURE(status)) { errln("FAIL"); return; } 295 for (i=0; i<0x200; ++i) { 296 UBool l = u_isalpha((UChar)i); 297 if (l != set.contains(i)) { 298 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 299 set.contains(i)); 300 if (++failures == 10) break; 301 } 302 } 303 304 set.applyPattern("[:Lu:]", status); 305 if (U_FAILURE(status)) { errln("FAIL"); return; } 306 for (i=0; i<0x200; ++i) { 307 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 308 if (lu != set.contains(i)) { 309 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 310 set.contains(i)); 311 if (++failures == 20) break; 312 } 313 } 314 } 315 void 316 UnicodeSetTest::TestCloneEqualHash(void) { 317 UErrorCode status = U_ZERO_ERROR; 318 // set1 and set2 used to be built 
with the obsolete constructor taking 319 // UCharCategory values; replaced with pattern constructors 320 // markus 20030502 321 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 322 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 323 if (U_FAILURE(status)){ 324 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 325 return; 326 } 327 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 328 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 329 if (U_FAILURE(status)){ 330 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 331 return; 332 } 333 334 if (*set1 != *set1a) { 335 errln("FAIL: category constructor for Ll broken"); 336 } 337 if (*set2 != *set2a) { 338 errln("FAIL: category constructor for Nd broken"); 339 } 340 delete set1a; 341 delete set2a; 342 343 logln("Testing copy construction"); 344 UnicodeSet *set1copy=new UnicodeSet(*set1); 345 if(*set1 != *set1copy || *set1 == *set2 || 346 getPairs(*set1) != getPairs(*set1copy) || 347 set1->hashCode() != set1copy->hashCode()){ 348 errln("FAIL : Error in copy construction"); 349 return; 350 } 351 352 logln("Testing =operator"); 353 UnicodeSet set1equal=*set1; 354 UnicodeSet set2equal=*set2; 355 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 356 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 357 errln("FAIL: Error in =operator"); 358 } 359 360 logln("Testing clone()"); 361 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 362 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 363 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 364 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 365 *set2clone == *set1 || 
*set2clone == set1equal || *set2clone == *set1clone){ 366 errln("FAIL: Error in clone"); 367 } 368 369 logln("Testing hashcode"); 370 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 371 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 372 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 373 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 374 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 375 errln("FAIL: Error in hashCode()"); 376 } 377 378 delete set1; 379 delete set1copy; 380 delete set2; 381 delete set1clone; 382 delete set2clone; 383 384 385 } 386 void 387 UnicodeSetTest::TestAddRemove(void) { 388 UnicodeSet set; // Construct empty set 389 doAssert(set.isEmpty() == TRUE, "set should be empty"); 390 doAssert(set.size() == 0, "size should be 0"); 391 set.complement(); 392 doAssert(set.size() == 0x110000, "size should be 0x110000"); 393 set.clear(); 394 set.add(0x0061, 0x007a); 395 expectPairs(set, "az"); 396 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 397 doAssert(set.size() != 0, "size should not be equal to 0"); 398 doAssert(set.size() == 26, "size should be equal to 26"); 399 set.remove(0x006d, 0x0070); 400 expectPairs(set, "alqz"); 401 doAssert(set.size() == 22, "size should be equal to 22"); 402 set.remove(0x0065, 0x0067); 403 expectPairs(set, "adhlqz"); 404 doAssert(set.size() == 19, "size should be equal to 19"); 405 set.remove(0x0064, 0x0069); 406 expectPairs(set, "acjlqz"); 407 doAssert(set.size() == 16, "size should be equal to 16"); 408 set.remove(0x0063, 0x0072); 409 expectPairs(set, "absz"); 410 doAssert(set.size() == 10, "size should be equal to 10"); 411 set.add(0x0066, 0x0071); 412 expectPairs(set, "abfqsz"); 413 doAssert(set.size() == 22, "size should be equal to 22"); 414 set.remove(0x0061, 0x0067); 415 expectPairs(set, 
"hqsz"); 416 set.remove(0x0061, 0x007a); 417 expectPairs(set, ""); 418 doAssert(set.isEmpty() == TRUE, "set should be empty"); 419 doAssert(set.size() == 0, "size should be 0"); 420 set.add(0x0061); 421 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 422 doAssert(set.size() == 1, "size should not be equal to 1"); 423 set.add(0x0062); 424 set.add(0x0063); 425 expectPairs(set, "ac"); 426 doAssert(set.size() == 3, "size should not be equal to 3"); 427 set.add(0x0070); 428 set.add(0x0071); 429 expectPairs(set, "acpq"); 430 doAssert(set.size() == 5, "size should not be equal to 5"); 431 set.clear(); 432 expectPairs(set, ""); 433 doAssert(set.isEmpty() == TRUE, "set should be empty"); 434 doAssert(set.size() == 0, "size should be 0"); 435 436 // Try removing an entire set from another set 437 expectPattern(set, "[c-x]", "cx"); 438 UnicodeSet set2; 439 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 440 set.removeAll(set2); 441 expectPairs(set, "deluxx"); 442 443 // Try adding an entire set to another set 444 expectPattern(set, "[jackiemclean]", "aacceein"); 445 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 446 set.addAll(set2); 447 expectPairs(set, "aacehort"); 448 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 449 450 // Try retaining an set of elements contained in another set (intersection) 451 UnicodeSet set3; 452 expectPattern(set3, "[a-c]", "ac"); 453 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 454 set3.remove(0x0062); 455 expectPairs(set3, "aacc"); 456 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 457 set.retainAll(set3); 458 expectPairs(set, "aacc"); 459 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 460 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 461 set.clear(); 462 doAssert(set.size() != set3.size(), "set.size() != 
set3.size()"); 463 464 // Test commutativity 465 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 466 expectPattern(set2, "[jackiemclean]", "aacceein"); 467 set.addAll(set2); 468 expectPairs(set, "aacehort"); 469 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 470 471 472 473 474 } 475 476 /** 477 * Make sure minimal representation is maintained. 478 */ 479 void UnicodeSetTest::TestMinimalRep() { 480 UErrorCode status = U_ZERO_ERROR; 481 // This is pretty thoroughly tested by checkCanonicalRep() 482 // run against the exhaustive operation results. Use the code 483 // here for debugging specific spot problems. 484 485 // 1 overlap against 2 486 UnicodeSet set("[h-km-q]", status); 487 if (U_FAILURE(status)) { errln("FAIL"); return; } 488 UnicodeSet set2("[i-o]", status); 489 if (U_FAILURE(status)) { errln("FAIL"); return; } 490 set.addAll(set2); 491 expectPairs(set, "hq"); 492 // right 493 set.applyPattern("[a-m]", status); 494 if (U_FAILURE(status)) { errln("FAIL"); return; } 495 set2.applyPattern("[e-o]", status); 496 if (U_FAILURE(status)) { errln("FAIL"); return; } 497 set.addAll(set2); 498 expectPairs(set, "ao"); 499 // left 500 set.applyPattern("[e-o]", status); 501 if (U_FAILURE(status)) { errln("FAIL"); return; } 502 set2.applyPattern("[a-m]", status); 503 if (U_FAILURE(status)) { errln("FAIL"); return; } 504 set.addAll(set2); 505 expectPairs(set, "ao"); 506 // 1 overlap against 3 507 set.applyPattern("[a-eg-mo-w]", status); 508 if (U_FAILURE(status)) { errln("FAIL"); return; } 509 set2.applyPattern("[d-q]", status); 510 if (U_FAILURE(status)) { errln("FAIL"); return; } 511 set.addAll(set2); 512 expectPairs(set, "aw"); 513 } 514 515 void UnicodeSetTest::TestAPI() { 516 UErrorCode status = U_ZERO_ERROR; 517 // default ct 518 UnicodeSet set; 519 if (!set.isEmpty() || set.getRangeCount() != 0) { 520 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 521 set); 522 } 523 524 // clear(), 
isEmpty() 525 set.add(0x0061); 526 if (set.isEmpty()) { 527 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 528 set); 529 } 530 set.clear(); 531 if (!set.isEmpty()) { 532 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 533 set); 534 } 535 536 // size() 537 set.clear(); 538 if (set.size() != 0) { 539 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 540 ": " + set); 541 } 542 set.add(0x0061); 543 if (set.size() != 1) { 544 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 545 ": " + set); 546 } 547 set.add(0x0031, 0x0039); 548 if (set.size() != 10) { 549 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 550 ": " + set); 551 } 552 553 // contains(first, last) 554 set.clear(); 555 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 556 if (U_FAILURE(status)) { errln("FAIL"); return; } 557 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 558 UChar32 a = set.getRangeStart(i); 559 UChar32 b = set.getRangeEnd(i); 560 if (!set.contains(a, b)) { 561 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 562 " but doesn't: " + set); 563 } 564 if (set.contains((UChar32)(a-1), b)) { 565 errln((UnicodeString)"FAIL, shouldn't contain " + 566 (unsigned short)(a-1) + '-' + (unsigned short)b + 567 " but does: " + set); 568 } 569 if (set.contains(a, (UChar32)(b+1))) { 570 errln((UnicodeString)"FAIL, shouldn't contain " + 571 (unsigned short)a + '-' + (unsigned short)(b+1) + 572 " but does: " + set); 573 } 574 } 575 576 // Ported InversionList test. 
577 UnicodeSet a((UChar32)3,(UChar32)10); 578 UnicodeSet b((UChar32)7,(UChar32)15); 579 UnicodeSet c; 580 581 logln((UnicodeString)"a [3-10]: " + a); 582 logln((UnicodeString)"b [7-15]: " + b); 583 c = a; 584 c.addAll(b); 585 UnicodeSet exp((UChar32)3,(UChar32)15); 586 if (c == exp) { 587 logln((UnicodeString)"c.set(a).add(b): " + c); 588 } else { 589 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 590 } 591 c.complement(); 592 exp.set((UChar32)0, (UChar32)2); 593 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 594 if (c == exp) { 595 logln((UnicodeString)"c.complement(): " + c); 596 } else { 597 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 598 } 599 c.complement(); 600 exp.set((UChar32)3, (UChar32)15); 601 if (c == exp) { 602 logln((UnicodeString)"c.complement(): " + c); 603 } else { 604 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 605 } 606 c = a; 607 c.complementAll(b); 608 exp.set((UChar32)3,(UChar32)6); 609 exp.add((UChar32)11,(UChar32) 15); 610 if (c == exp) { 611 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 612 } else { 613 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 614 } 615 616 exp = c; 617 bitsToSet(setToBits(c), c); 618 if (c == exp) { 619 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 620 } else { 621 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 622 } 623 624 // Additional tests for coverage JB#2118 625 //UnicodeSet::complement(class UnicodeString const &) 626 //UnicodeSet::complementAll(class UnicodeString const &) 627 //UnicodeSet::containsNone(class UnicodeSet const &) 628 //UnicodeSet::containsNone(long,long) 629 //UnicodeSet::containsSome(class UnicodeSet const &) 630 //UnicodeSet::containsSome(long,long) 631 //UnicodeSet::removeAll(class UnicodeString const &) 632 //UnicodeSet::retain(long) 633 //UnicodeSet::retainAll(class UnicodeString const &) 634 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 635 //UnicodeSetIterator::getString(void) 636 set.clear(); 637 set.complement("ab"); 638 exp.applyPattern("[{ab}]", status); 639 if (U_FAILURE(status)) { errln("FAIL"); return; } 640 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 641 642 UnicodeSetIterator iset(set); 643 if (!iset.next() || !iset.isString()) { 644 errln("FAIL: UnicodeSetIterator::next/isString"); 645 } else if (iset.getString() != "ab") { 646 errln("FAIL: UnicodeSetIterator::getString"); 647 } 648 649 set.add((UChar32)0x61, (UChar32)0x7A); 650 set.complementAll("alan"); 651 exp.applyPattern("[{ab}b-kmo-z]", status); 652 if (U_FAILURE(status)) { errln("FAIL"); return; } 653 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 654 655 exp.applyPattern("[a-z]", status); 656 if (U_FAILURE(status)) { errln("FAIL"); return; } 657 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 658 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 659 exp.applyPattern("[aln]", status); 660 if (U_FAILURE(status)) { errln("FAIL"); return; } 661 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 662 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 663 664 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 665 errln("FAIL: containsNone(UChar32, UChar32)"); 666 } 667 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 668 errln("FAIL: containsSome(UChar32, UChar32)"); 669 } 670 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 671 errln("FAIL: containsNone(UChar32, UChar32)"); 672 } 673 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 674 errln("FAIL: containsSome(UChar32, UChar32)"); 675 } 676 677 set.removeAll("liu"); 678 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 679 if (U_FAILURE(status)) { errln("FAIL"); return; } 680 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 681 682 set.retainAll("star"); 683 
exp.applyPattern("[rst]", status); 684 if (U_FAILURE(status)) { errln("FAIL"); return; } 685 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 686 687 set.retain((UChar32)0x73); 688 exp.applyPattern("[s]", status); 689 if (U_FAILURE(status)) { errln("FAIL"); return; } 690 if (set != exp) { errln("FAIL: retain('s')"); return; } 691 692 uint16_t buf[32]; 693 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 694 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 695 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 696 errln("FAIL: serialize"); 697 return; 698 } 699 700 // Conversions to and from USet 701 UnicodeSet *uniset = &set; 702 USet *uset = uniset->toUSet(); 703 TEST_ASSERT((void *)uset == (void *)uniset); 704 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 705 TEST_ASSERT((void *)setx == (void *)uset); 706 const UnicodeSet *constSet = uniset; 707 const USet *constUSet = constSet->toUSet(); 708 TEST_ASSERT((void *)constUSet == (void *)constSet); 709 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 710 TEST_ASSERT((void *)constSetx == (void *)constUSet); 711 712 // span(UnicodeString) and spanBack(UnicodeString) convenience methods 713 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc"); 714 UnicodeSet ac(0x61, 0x63); 715 ac.remove(0x62).freeze(); 716 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 || 717 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 || 718 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 || 719 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 || 720 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 || 721 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 || 722 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 || 723 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 || 724 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 || 725 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30 726 ) { 727 errln("UnicodeSet.span(UnicodeString, ...) 
returns incorrect end indexes"); 728 } 729 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 || 730 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 || 731 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 || 732 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 || 733 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 || 734 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 || 735 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 || 736 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 || 737 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 || 738 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20 739 ) { 740 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes"); 741 } 742 } 743 744 void UnicodeSetTest::TestIteration() { 745 UErrorCode ec = U_ZERO_ERROR; 746 int i = 0; 747 int outerLoop; 748 749 // 6 code points, 3 ranges, 2 strings, 8 total elements 750 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 751 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 752 TEST_ASSERT_SUCCESS(ec); 753 UnicodeSetIterator it(set); 754 755 for (outerLoop=0; outerLoop<3; outerLoop++) { 756 // Run the test multiple times, to check that iterator.reset() is working. 
757 for (i=0; i<10; i++) { 758 UBool nextv = it.next(); 759 UBool isString = it.isString(); 760 int32_t codePoint = it.getCodepoint(); 761 //int32_t codePointEnd = it.getCodepointEnd(); 762 UnicodeString s = it.getString(); 763 switch (i) { 764 case 0: 765 TEST_ASSERT(nextv == TRUE); 766 TEST_ASSERT(isString == FALSE); 767 TEST_ASSERT(codePoint==0x61); 768 TEST_ASSERT(s == "a"); 769 break; 770 case 1: 771 TEST_ASSERT(nextv == TRUE); 772 TEST_ASSERT(isString == FALSE); 773 TEST_ASSERT(codePoint==0x62); 774 TEST_ASSERT(s == "b"); 775 break; 776 case 2: 777 TEST_ASSERT(nextv == TRUE); 778 TEST_ASSERT(isString == FALSE); 779 TEST_ASSERT(codePoint==0x63); 780 TEST_ASSERT(s == "c"); 781 break; 782 case 3: 783 TEST_ASSERT(nextv == TRUE); 784 TEST_ASSERT(isString == FALSE); 785 TEST_ASSERT(codePoint==0x79); 786 TEST_ASSERT(s == "y"); 787 break; 788 case 4: 789 TEST_ASSERT(nextv == TRUE); 790 TEST_ASSERT(isString == FALSE); 791 TEST_ASSERT(codePoint==0x7a); 792 TEST_ASSERT(s == "z"); 793 break; 794 case 5: 795 TEST_ASSERT(nextv == TRUE); 796 TEST_ASSERT(isString == FALSE); 797 TEST_ASSERT(codePoint==0x1abcd); 798 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 799 break; 800 case 6: 801 TEST_ASSERT(nextv == TRUE); 802 TEST_ASSERT(isString == TRUE); 803 TEST_ASSERT(s == "str1"); 804 break; 805 case 7: 806 TEST_ASSERT(nextv == TRUE); 807 TEST_ASSERT(isString == TRUE); 808 TEST_ASSERT(s == "str2"); 809 break; 810 case 8: 811 TEST_ASSERT(nextv == FALSE); 812 break; 813 case 9: 814 TEST_ASSERT(nextv == FALSE); 815 break; 816 } 817 } 818 it.reset(); // prepare to run the iteration again. 
819 } 820 } 821 822 823 824 825 void UnicodeSetTest::TestStrings() { 826 UErrorCode ec = U_ZERO_ERROR; 827 828 UnicodeSet* testList[] = { 829 UnicodeSet::createFromAll("abc"), 830 new UnicodeSet("[a-c]", ec), 831 832 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 833 new UnicodeSet("[{ll}{ch}a-z]", ec), 834 835 UnicodeSet::createFrom("ab}c"), 836 new UnicodeSet("[{ab\\}c}]", ec), 837 838 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 839 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 840 841 NULL 842 }; 843 844 if (U_FAILURE(ec)) { 845 errln("FAIL: couldn't construct test sets"); 846 } 847 848 for (int32_t i = 0; testList[i] != NULL; i+=2) { 849 if (U_SUCCESS(ec)) { 850 UnicodeString pat0, pat1; 851 testList[i]->toPattern(pat0, TRUE); 852 testList[i+1]->toPattern(pat1, TRUE); 853 if (*testList[i] == *testList[i+1]) { 854 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 855 } else { 856 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 857 } 858 } 859 delete testList[i]; 860 delete testList[i+1]; 861 } 862 } 863 864 /** 865 * Test the [:Latin:] syntax. 866 */ 867 void UnicodeSetTest::TestScriptSet() { 868 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 869 870 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 871 872 /* Jitterbug 1423 */ 873 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 874 875 } 876 877 /** 878 * Test the [:Latin:] syntax. 
/**
 * Data-driven containment test for property-based and regex-style
 * patterns.  DATA is a flat list of triples: a set pattern, a string of
 * characters that must be in the resulting set, and a string of
 * characters that must not be in it.  Each triple is handed to
 * expectContainment(pattern, charsIn, charsOut); the two character
 * strings are run through CharsToUnicodeString() first.
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter a }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u061D\\uFDEF\\uFDFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655",
        "\\uFDF2"
    };

    // Number of strings in DATA; the loop below consumes them three at a time.
    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}

/**
 * Test that Posix style character classes [:digit:], etc.
 *   have the Unicode definitions from TR 18.
1071 */ 1072 void UnicodeSetTest::TestPosixClasses() { 1073 { 1074 UErrorCode status = U_ZERO_ERROR; 1075 UnicodeSet s1("[:alpha:]", status); 1076 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1077 TEST_ASSERT_SUCCESS(status); 1078 TEST_ASSERT(s1==s2); 1079 } 1080 { 1081 UErrorCode status = U_ZERO_ERROR; 1082 UnicodeSet s1("[:lower:]", status); 1083 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1084 TEST_ASSERT_SUCCESS(status); 1085 TEST_ASSERT(s1==s2); 1086 } 1087 { 1088 UErrorCode status = U_ZERO_ERROR; 1089 UnicodeSet s1("[:upper:]", status); 1090 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1091 TEST_ASSERT_SUCCESS(status); 1092 TEST_ASSERT(s1==s2); 1093 } 1094 { 1095 UErrorCode status = U_ZERO_ERROR; 1096 UnicodeSet s1("[:punct:]", status); 1097 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1098 TEST_ASSERT_SUCCESS(status); 1099 TEST_ASSERT(s1==s2); 1100 } 1101 { 1102 UErrorCode status = U_ZERO_ERROR; 1103 UnicodeSet s1("[:digit:]", status); 1104 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1105 TEST_ASSERT_SUCCESS(status); 1106 TEST_ASSERT(s1==s2); 1107 } 1108 { 1109 UErrorCode status = U_ZERO_ERROR; 1110 UnicodeSet s1("[:xdigit:]", status); 1111 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1112 TEST_ASSERT_SUCCESS(status); 1113 TEST_ASSERT(s1==s2); 1114 } 1115 { 1116 UErrorCode status = U_ZERO_ERROR; 1117 UnicodeSet s1("[:alnum:]", status); 1118 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1119 TEST_ASSERT_SUCCESS(status); 1120 TEST_ASSERT(s1==s2); 1121 } 1122 { 1123 UErrorCode status = U_ZERO_ERROR; 1124 UnicodeSet s1("[:space:]", status); 1125 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1126 TEST_ASSERT_SUCCESS(status); 1127 TEST_ASSERT(s1==s2); 1128 } 1129 { 1130 UErrorCode status = U_ZERO_ERROR; 1131 UnicodeSet s1("[:blank:]", status); 1132 
TEST_ASSERT_SUCCESS(status); 1133 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1134 status); 1135 TEST_ASSERT_SUCCESS(status); 1136 TEST_ASSERT(s1==s2); 1137 } 1138 { 1139 UErrorCode status = U_ZERO_ERROR; 1140 UnicodeSet s1("[:cntrl:]", status); 1141 TEST_ASSERT_SUCCESS(status); 1142 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1143 TEST_ASSERT_SUCCESS(status); 1144 TEST_ASSERT(s1==s2); 1145 } 1146 { 1147 UErrorCode status = U_ZERO_ERROR; 1148 UnicodeSet s1("[:graph:]", status); 1149 TEST_ASSERT_SUCCESS(status); 1150 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1151 TEST_ASSERT_SUCCESS(status); 1152 TEST_ASSERT(s1==s2); 1153 } 1154 { 1155 UErrorCode status = U_ZERO_ERROR; 1156 UnicodeSet s1("[:print:]", status); 1157 TEST_ASSERT_SUCCESS(status); 1158 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1159 TEST_ASSERT_SUCCESS(status); 1160 TEST_ASSERT(s1==s2); 1161 } 1162 } 1163 /** 1164 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 1165 */ 1166 void UnicodeSetTest::TestClone() { 1167 UErrorCode ec = U_ZERO_ERROR; 1168 UnicodeSet s("[abcxyz]", ec); 1169 UnicodeSet t(s); 1170 expectContainment(t, "abc", "def"); 1171 } 1172 1173 /** 1174 * Test the indexOf() and charAt() methods. 
1175 */ 1176 void UnicodeSetTest::TestIndexOf() { 1177 UErrorCode ec = U_ZERO_ERROR; 1178 UnicodeSet set("[a-cx-y3578]", ec); 1179 if (U_FAILURE(ec)) { 1180 errln("FAIL: UnicodeSet constructor"); 1181 return; 1182 } 1183 for (int32_t i=0; i<set.size(); ++i) { 1184 UChar32 c = set.charAt(i); 1185 if (set.indexOf(c) != i) { 1186 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1187 i, c, set.indexOf(c)); 1188 } 1189 } 1190 UChar32 c = set.charAt(set.size()); 1191 if (c != -1) { 1192 errln("FAIL: charAt(<out of range>) = %X", c); 1193 } 1194 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1195 if (j != -1) { 1196 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1197 } 1198 } 1199 1200 /** 1201 * Test closure API. 1202 */ 1203 void UnicodeSetTest::TestCloseOver() { 1204 UErrorCode ec = U_ZERO_ERROR; 1205 1206 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1207 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1208 const char* DATA[] = { 1209 // selector, input, output 1210 CASE, 1211 "[aq\\u00DF{Bc}{bC}{Fi}]", 1212 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1213 1214 CASE, 1215 "[\\u01F1]", // 'DZ' 1216 "[\\u01F1\\u01F2\\u01F3]", 1217 1218 CASE, 1219 "[\\u1FB4]", 1220 "[\\u1FB4{\\u03AC\\u03B9}]", 1221 1222 CASE, 1223 "[{F\\uFB01}]", 1224 "[\\uFB03{ffi}]", 1225 1226 CASE, // make sure binary search finds limits 1227 "[a\\uFF3A]", 1228 "[aA\\uFF3A\\uFF5A]", 1229 1230 CASE, 1231 "[a-z]","[A-Za-z\\u017F\\u212A]", 1232 CASE, 1233 "[abc]","[A-Ca-c]", 1234 CASE, 1235 "[ABC]","[A-Ca-c]", 1236 1237 CASE, "[i]", "[iI]", 1238 1239 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1240 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1241 1242 CASE, "[\\u0131]", "[\\u0131]", // dotless i 1243 1244 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1245 1246 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1247 1248 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1249 1250 
CASE, "[\\u03f7]", "[\\u03f7\\u03f8]", 1251 1252 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1253 1254 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1255 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1256 1257 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1258 1259 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1260 1261 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1262 1263 #if !UCONFIG_NO_FILE_IO 1264 CASE_MAPPINGS, 1265 "[aq\\u00DF{Bc}{bC}{Fi}]", 1266 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1267 #endif 1268 1269 CASE_MAPPINGS, 1270 "[\\u01F1]", // 'DZ' 1271 "[\\u01F1\\u01F2\\u01F3]", 1272 1273 CASE_MAPPINGS, 1274 "[a-z]", 1275 "[A-Za-z]", 1276 1277 NULL 1278 }; 1279 1280 UnicodeSet s; 1281 UnicodeSet t; 1282 UnicodeString buf; 1283 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1284 int32_t selector = DATA[i][0]; 1285 UnicodeString pat(DATA[i+1], -1, US_INV); 1286 UnicodeString exp(DATA[i+2], -1, US_INV); 1287 s.applyPattern(pat, ec); 1288 s.closeOver(selector); 1289 t.applyPattern(exp, ec); 1290 if (U_FAILURE(ec)) { 1291 errln("FAIL: applyPattern failed"); 1292 continue; 1293 } 1294 if (s == t) { 1295 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1296 } else { 1297 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1298 s.toPattern(buf, TRUE) + ", expected " + exp); 1299 } 1300 } 1301 1302 #if 0 1303 /* 1304 * Unused test code. 1305 * This was used to compare the old implementation (using USET_CASE) 1306 * with the new one (using 0x100 temporarily) 1307 * while transitioning from hardcoded case closure tables in uniset.cpp 1308 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1309 * and using ucase.c functions for closure. 
1310 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1311 * 1312 * Note: The old and new implementation never fully matched because 1313 * the old implementation turned out to not map U+0130 and U+0131 correctly 1314 * (dotted I and dotless i) and because the old implementation's data tables 1315 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1316 * new implementation. (So sigmas and some other characters were not handled 1317 * according to the newer Unicode version.) 1318 */ 1319 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1320 UnicodeSetIterator si(sens); 1321 UnicodeString str, buf2; 1322 const UnicodeString *pStr; 1323 UChar32 c; 1324 while(si.next()) { 1325 if(!si.isString()) { 1326 c=si.getCodepoint(); 1327 s.clear(); 1328 s.add(c); 1329 1330 str.setTo(c); 1331 str.foldCase(); 1332 sens2.add(str); 1333 1334 t=s; 1335 s.closeOver(USET_CASE); 1336 t.closeOver(0x100); 1337 if(s!=t) { 1338 errln("FAIL: closeOver(U+%04x) differs: ", c); 1339 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1340 } 1341 } 1342 } 1343 // remove all code points 1344 // should contain all full case folding mapping strings 1345 sens2.remove(0, 0x10ffff); 1346 si.reset(sens2); 1347 while(si.next()) { 1348 if(si.isString()) { 1349 pStr=&si.getString(); 1350 s.clear(); 1351 s.add(*pStr); 1352 t=s2=s; 1353 s.closeOver(USET_CASE); 1354 t.closeOver(0x100); 1355 if(s!=t) { 1356 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1357 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1358 } 1359 } 1360 } 1361 #endif 1362 1363 // Test the pattern API 1364 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1365 if (U_FAILURE(ec)) { 1366 errln("FAIL: applyPattern failed"); 1367 } else { 1368 expectContainment(s, "abcABC", "defDEF"); 1369 } 1370 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1371 if (U_FAILURE(ec)) { 1372 
errln("FAIL: constructor failed"); 1373 } else { 1374 expectContainment(v, "defDEF", "abcABC"); 1375 } 1376 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1377 if (U_FAILURE(ec)) { 1378 errln("FAIL: construct w/case mappings failed"); 1379 } else { 1380 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1381 } 1382 } 1383 1384 void UnicodeSetTest::TestEscapePattern() { 1385 const char pattern[] = 1386 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1387 const char exp[] = 1388 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1389 // We test this with two passes; in the second pass we 1390 // pre-unescape the pattern. Since U+200E is Pattern_White_Space, 1391 // this fails -- which is what we expect. 1392 for (int32_t pass=1; pass<=2; ++pass) { 1393 UErrorCode ec = U_ZERO_ERROR; 1394 UnicodeString pat(pattern, -1, US_INV); 1395 if (pass==2) { 1396 pat = pat.unescape(); 1397 } 1398 // Pattern is only good for pass 1 1399 UBool isPatternValid = (pass==1); 1400 1401 UnicodeSet set(pat, ec); 1402 if (U_SUCCESS(ec) != isPatternValid){ 1403 errln((UnicodeString)"FAIL: applyPattern(" + 1404 escape(pat) + ") => " + 1405 u_errorName(ec)); 1406 continue; 1407 } 1408 if (U_FAILURE(ec)) { 1409 continue; 1410 } 1411 if (set.contains((UChar)0x0644)){ 1412 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1413 } 1414 1415 UnicodeString newpat; 1416 set.toPattern(newpat, TRUE); 1417 if (newpat == UnicodeString(exp, -1, US_INV)) { 1418 logln(escape(pat) + " => " + newpat); 1419 } else { 1420 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1421 } 1422 1423 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1424 UnicodeString str("Range "); 1425 str.append((UChar)(0x30 + i)) 1426 .append(": ") 1427 .append((UChar32)set.getRangeStart(i)) 1428 .append(" - ") 1429 .append((UChar32)set.getRangeEnd(i)); 1430 str = str + " (" + set.getRangeStart(i) + " - " + 1431 
set.getRangeEnd(i) + ")"; 1432 if (set.getRangeStart(i) < 0) { 1433 errln((UnicodeString)"FAIL: " + escape(str)); 1434 } else { 1435 logln(escape(str)); 1436 } 1437 } 1438 } 1439 } 1440 1441 void UnicodeSetTest::expectRange(const UnicodeString& label, 1442 const UnicodeSet& set, 1443 UChar32 start, UChar32 end) { 1444 UnicodeSet exp(start, end); 1445 UnicodeString pat; 1446 if (set == exp) { 1447 logln(label + " => " + set.toPattern(pat, TRUE)); 1448 } else { 1449 UnicodeString xpat; 1450 errln((UnicodeString)"FAIL: " + label + " => " + 1451 set.toPattern(pat, TRUE) + 1452 ", expected " + exp.toPattern(xpat, TRUE)); 1453 } 1454 } 1455 1456 void UnicodeSetTest::TestInvalidCodePoint() { 1457 1458 const UChar32 DATA[] = { 1459 // Test range Expected range 1460 0, 0x10FFFF, 0, 0x10FFFF, 1461 (UChar32)-1, 8, 0, 8, 1462 8, 0x110000, 8, 0x10FFFF 1463 }; 1464 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1465 1466 UnicodeString pat; 1467 int32_t i; 1468 1469 for (i=0; i<DATA_LENGTH; i+=4) { 1470 UChar32 start = DATA[i]; 1471 UChar32 end = DATA[i+1]; 1472 UChar32 xstart = DATA[i+2]; 1473 UChar32 xend = DATA[i+3]; 1474 1475 // Try various API using the test code points 1476 1477 UnicodeSet set(start, end); 1478 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1479 set, xstart, xend); 1480 1481 set.clear(); 1482 set.set(start, end); 1483 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1484 set, xstart, xend); 1485 1486 UBool b = set.contains(start); 1487 b = set.contains(start, end); 1488 b = set.containsNone(start, end); 1489 b = set.containsSome(start, end); 1490 (void)b; // Suppress set but not used warning. 
1491 1492 /*int32_t index = set.indexOf(start);*/ 1493 1494 set.clear(); 1495 set.add(start); 1496 set.add(start, end); 1497 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1498 set, xstart, xend); 1499 1500 set.set(0, 0x10FFFF); 1501 set.retain(start, end); 1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1503 set, xstart, xend); 1504 set.retain(start); 1505 1506 set.set(0, 0x10FFFF); 1507 set.remove(start); 1508 set.remove(start, end); 1509 set.complement(); 1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1511 set, xstart, xend); 1512 1513 set.set(0, 0x10FFFF); 1514 set.complement(start, end); 1515 set.complement(); 1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1517 set, xstart, xend); 1518 set.complement(start); 1519 } 1520 1521 const UChar32 DATA2[] = { 1522 0, 1523 0x10FFFF, 1524 (UChar32)-1, 1525 0x110000 1526 }; 1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1528 1529 for (i=0; i<DATA2_LENGTH; ++i) { 1530 UChar32 c = DATA2[i], end = 0x10FFFF; 1531 UBool valid = (c >= 0 && c <= 0x10FFFF); 1532 1533 UnicodeSet set(0, 0x10FFFF); 1534 1535 // For single-codepoint contains, invalid codepoints are NOT contained 1536 UBool b = set.contains(c); 1537 if (b == valid) { 1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1539 ") = " + b); 1540 } else { 1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1542 ") = " + b); 1543 } 1544 1545 // For codepoint range contains, containsNone, and containsSome, 1546 // invalid or empty (start > end) ranges have UNDEFINED behavior. 
1547 b = set.contains(c, end); 1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1549 "," + end + ") = " + b); 1550 1551 b = set.containsNone(c, end); 1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1553 "," + end + ") = " + b); 1554 1555 b = set.containsSome(c, end); 1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1557 "," + end + ") = " + b); 1558 1559 int32_t index = set.indexOf(c); 1560 if ((index >= 0) == valid) { 1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1562 ") = " + index); 1563 } else { 1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1565 ") = " + index); 1566 } 1567 } 1568 } 1569 1570 // Used by TestSymbolTable 1571 class TokenSymbolTable : public SymbolTable { 1572 public: 1573 Hashtable contents; 1574 1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1576 contents.setValueDeleter(uprv_deleteUObject); 1577 } 1578 1579 ~TokenSymbolTable() {} 1580 1581 /** 1582 * (Non-SymbolTable API) Add the given variable and value to 1583 * the table. Variable should NOT contain leading '$'. 
1584 */ 1585 void add(const UnicodeString& var, const UnicodeString& value, 1586 UErrorCode& ec) { 1587 if (U_SUCCESS(ec)) { 1588 contents.put(var, new UnicodeString(value), ec); 1589 } 1590 } 1591 1592 /** 1593 * SymbolTable API 1594 */ 1595 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1596 return (const UnicodeString*) contents.get(s); 1597 } 1598 1599 /** 1600 * SymbolTable API 1601 */ 1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1603 return NULL; 1604 } 1605 1606 /** 1607 * SymbolTable API 1608 */ 1609 virtual UnicodeString parseReference(const UnicodeString& text, 1610 ParsePosition& pos, int32_t limit) const { 1611 int32_t start = pos.getIndex(); 1612 int32_t i = start; 1613 UnicodeString result; 1614 while (i < limit) { 1615 UChar c = text.charAt(i); 1616 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1617 break; 1618 } 1619 ++i; 1620 } 1621 if (i == start) { // No valid name chars 1622 return result; // Indicate failure with empty string 1623 } 1624 pos.setIndex(i); 1625 text.extractBetween(start, i, result); 1626 return result; 1627 } 1628 }; 1629 1630 void UnicodeSetTest::TestSymbolTable() { 1631 // Multiple test cases can be set up here. Each test case 1632 // is terminated by null: 1633 // var, value, var, value,..., input pat., exp. 
output pat., null 1634 const char* DATA[] = { 1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1638 NULL 1639 }; 1640 1641 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1642 UErrorCode ec = U_ZERO_ERROR; 1643 TokenSymbolTable sym(ec); 1644 if (U_FAILURE(ec)) { 1645 errln("FAIL: couldn't construct TokenSymbolTable"); 1646 continue; 1647 } 1648 1649 // Set up variables 1650 while (DATA[i+2] != NULL) { 1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1652 if (U_FAILURE(ec)) { 1653 errln("FAIL: couldn't add to TokenSymbolTable"); 1654 continue; 1655 } 1656 i += 2; 1657 } 1658 1659 // Input pattern and expected output pattern 1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1661 i += 2; 1662 1663 ParsePosition pos(0); 1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1665 if (U_FAILURE(ec)) { 1666 errln("FAIL: couldn't construct UnicodeSet"); 1667 continue; 1668 } 1669 1670 // results 1671 if (pos.getIndex() != inpat.length()) { 1672 errln((UnicodeString)"Failed to read to end of string \"" 1673 + inpat + "\": read to " 1674 + pos.getIndex() + ", length is " 1675 + inpat.length()); 1676 } 1677 1678 UnicodeSet us2(exppat, ec); 1679 if (U_FAILURE(ec)) { 1680 errln("FAIL: couldn't construct expected UnicodeSet"); 1681 continue; 1682 } 1683 1684 UnicodeString a, b; 1685 if (us != us2) { 1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1687 ", expected " + us2.toPattern(b, TRUE)); 1688 } else { 1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1690 } 1691 } 1692 } 1693 1694 void UnicodeSetTest::TestSurrogate() { 1695 const char* DATA[] = { 1696 // These should all behave identically 1697 "[abc\\uD800\\uDC00]", 1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1699 "[abc\\U00010000]", 1700 0 1701 }; 1702 for (int 
i=0; DATA[i] != 0; ++i) { 1703 UErrorCode ec = U_ZERO_ERROR; 1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1706 UnicodeSet set(str, ec); 1707 if (U_FAILURE(ec)) { 1708 errln("FAIL: UnicodeSet constructor"); 1709 continue; 1710 } 1711 expectContainment(set, 1712 CharsToUnicodeString("abc\\U00010000"), 1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1714 if (set.size() != 4) { 1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1716 set.size() + ", expected 4"); 1717 } 1718 1719 { 1720 UErrorCode subErr = U_ZERO_ERROR; 1721 checkRoundTrip(set); 1722 checkSerializeRoundTrip(set, subErr); 1723 } 1724 } 1725 } 1726 1727 void UnicodeSetTest::TestExhaustive() { 1728 // exhaustive tests. Simulate UnicodeSets with integers. 1729 // That gives us very solid tests (except for large memory tests). 1730 1731 int32_t limit = 128; 1732 1733 UnicodeSet x, y, z, aa; 1734 1735 for (int32_t i = 0; i < limit; ++i) { 1736 bitsToSet(i, x); 1737 logln((UnicodeString)"Testing " + i + ", " + x); 1738 _testComplement(i, x, y); 1739 1740 UnicodeSet &toTest = bitsToSet(i, aa); 1741 1742 // AS LONG AS WE ARE HERE, check roundtrip 1743 checkRoundTrip(toTest); 1744 UErrorCode ec = U_ZERO_ERROR; 1745 checkSerializeRoundTrip(toTest, ec); 1746 1747 for (int32_t j = 0; j < limit; ++j) { 1748 _testAdd(i,j, x,y,z); 1749 _testXor(i,j, x,y,z); 1750 _testRetain(i,j, x,y,z); 1751 _testRemove(i,j, x,y,z); 1752 } 1753 } 1754 } 1755 1756 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1757 bitsToSet(a, x); 1758 z = x; 1759 z.complement(); 1760 int32_t c = setToBits(z); 1761 if (c != (~a)) { 1762 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1763 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1764 } 1765 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1766 } 1767 1768 void 
UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1769 bitsToSet(a, x); 1770 bitsToSet(b, y); 1771 z = x; 1772 z.addAll(y); 1773 int32_t c = setToBits(z); 1774 if (c != (a | b)) { 1775 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1776 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1777 } 1778 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1779 } 1780 1781 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1782 bitsToSet(a, x); 1783 bitsToSet(b, y); 1784 z = x; 1785 z.retainAll(y); 1786 int32_t c = setToBits(z); 1787 if (c != (a & b)) { 1788 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1789 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1790 } 1791 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1792 } 1793 1794 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1795 bitsToSet(a, x); 1796 bitsToSet(b, y); 1797 z = x; 1798 z.removeAll(y); 1799 int32_t c = setToBits(z); 1800 if (c != (a &~ b)) { 1801 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1802 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1803 } 1804 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1805 } 1806 1807 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1808 bitsToSet(a, x); 1809 bitsToSet(b, y); 1810 z = x; 1811 z.complementAll(y); 1812 int32_t c = setToBits(z); 1813 if (c != (a ^ b)) { 1814 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1815 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1816 } 1817 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1818 } 1819 1820 /** 1821 * Check that ranges are monotonically increasing and non- 1822 * overlapping. 
1823 */ 1824 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1825 int32_t n = set.getRangeCount(); 1826 if (n < 0) { 1827 errln((UnicodeString)"FAIL result of " + msg + 1828 ": range count should be >= 0 but is " + 1829 n /*+ " for " + set.toPattern())*/); 1830 return; 1831 } 1832 UChar32 last = 0; 1833 for (int32_t i=0; i<n; ++i) { 1834 UChar32 start = set.getRangeStart(i); 1835 UChar32 end = set.getRangeEnd(i); 1836 if (start > end) { 1837 errln((UnicodeString)"FAIL result of " + msg + 1838 ": range " + (i+1) + 1839 " start > end: " + (int)start + ", " + (int)end + 1840 " for " + set); 1841 } 1842 if (i > 0 && start <= last) { 1843 errln((UnicodeString)"FAIL result of " + msg + 1844 ": range " + (i+1) + 1845 " overlaps previous range: " + (int)start + ", " + (int)end + 1846 " for " + set); 1847 } 1848 last = end; 1849 } 1850 } 1851 1852 /** 1853 * Convert a bitmask to a UnicodeSet. 1854 */ 1855 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1856 result.clear(); 1857 for (UChar32 i = 0; i < 32; ++i) { 1858 if ((a & (1<<i)) != 0) { 1859 result.add(i); 1860 } 1861 } 1862 return result; 1863 } 1864 1865 /** 1866 * Convert a UnicodeSet to a bitmask. Only the characters 1867 * U+0000 to U+0020 are represented in the bitmask. 1868 */ 1869 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1870 int32_t result = 0; 1871 for (int32_t i = 0; i < 32; ++i) { 1872 if (x.contains((UChar32)i)) { 1873 result |= (1<<i); 1874 } 1875 } 1876 return result; 1877 } 1878 1879 /** 1880 * Return the representation of an inversion list based UnicodeSet 1881 * as a pairs list. Ranges are listed in ascending Unicode order. 1882 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1883 */ 1884 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1885 UnicodeString pairs; 1886 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1887 UChar32 start = set.getRangeStart(i); 1888 UChar32 end = set.getRangeEnd(i); 1889 if (end > 0xFFFF) { 1890 end = 0xFFFF; 1891 i = set.getRangeCount(); // Should be unnecessary 1892 } 1893 pairs.append((UChar)start).append((UChar)end); 1894 } 1895 return pairs; 1896 } 1897 1898 /** 1899 * Basic consistency check for a few items. 1900 * That the iterator works, and that we can create a pattern and 1901 * get the same thing back 1902 */ 1903 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1904 { 1905 UnicodeSet t(s); 1906 checkEqual(s, t, "copy ct"); 1907 } 1908 1909 { 1910 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten 1911 t = s; 1912 checkEqual(s, t, "operator="); 1913 } 1914 1915 { 1916 UnicodeSet t; 1917 copyWithIterator(t, s, FALSE); 1918 checkEqual(s, t, "iterator roundtrip"); 1919 } 1920 1921 { 1922 UnicodeSet t; 1923 copyWithIterator(t, s, TRUE); // try range 1924 checkEqual(s, t, "iterator roundtrip"); 1925 } 1926 1927 { 1928 UnicodeSet t; 1929 UnicodeString pat; 1930 UErrorCode ec = U_ZERO_ERROR; 1931 s.toPattern(pat, FALSE); 1932 t.applyPattern(pat, ec); 1933 if (U_FAILURE(ec)) { 1934 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec)); 1935 return; 1936 } else { 1937 checkEqual(s, t, "toPattern(false)"); 1938 } 1939 } 1940 1941 { 1942 UnicodeSet t; 1943 UnicodeString pat; 1944 UErrorCode ec = U_ZERO_ERROR; 1945 s.toPattern(pat, TRUE); 1946 t.applyPattern(pat, ec); 1947 if (U_FAILURE(ec)) { 1948 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec)); 1949 return; 1950 } else { 1951 checkEqual(s, t, "toPattern(true)"); 1952 } 1953 } 1954 } 1955 1956 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) { 1957 if(U_FAILURE(status)) return; 1958 int32_t len = 
t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status); 1959 if(status == U_BUFFER_OVERFLOW_ERROR) { 1960 status = U_ZERO_ERROR; 1961 serializeBuffer.resize(len); 1962 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status); 1963 // let 2nd error stand 1964 } 1965 if(U_FAILURE(status)) { 1966 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status)); 1967 return; 1968 } 1969 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status); 1970 if(U_FAILURE(status)) { 1971 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount()); 1972 return; 1973 } 1974 1975 checkEqual(t, deserialized, "Set was unequal when deserialized"); 1976 } 1977 1978 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1979 t.clear(); 1980 UnicodeSetIterator it(s); 1981 if (withRange) { 1982 while (it.nextRange()) { 1983 if (it.isString()) { 1984 t.add(it.getString()); 1985 } else { 1986 t.add(it.getCodepoint(), it.getCodepointEnd()); 1987 } 1988 } 1989 } else { 1990 while (it.next()) { 1991 if (it.isString()) { 1992 t.add(it.getString()); 1993 } else { 1994 t.add(it.getCodepoint()); 1995 } 1996 } 1997 } 1998 } 1999 2000 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) { 2001 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount()); 2002 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size()); 2003 UnicodeString source; s.toPattern(source, TRUE); 2004 UnicodeString result; t.toPattern(result, TRUE); 2005 if (s != t) { 2006 errln((UnicodeString)"FAIL: " + message 2007 + "; source = " + source 2008 + "; result = " + result 2009 ); 2010 return FALSE; 2011 } else { 2012 logln((UnicodeString)"Ok: " + message 2013 + "; source = " + source 2014 + "; 
result = " + result 2015 ); 2016 } 2017 return TRUE; 2018 } 2019 2020 void 2021 UnicodeSetTest::expectContainment(const UnicodeString& pat, 2022 const UnicodeString& charsIn, 2023 const UnicodeString& charsOut) { 2024 UErrorCode ec = U_ZERO_ERROR; 2025 UnicodeSet set(pat, ec); 2026 if (U_FAILURE(ec)) { 2027 dataerrln((UnicodeString)"FAIL: pattern \"" + 2028 pat + "\" => " + u_errorName(ec)); 2029 return; 2030 } 2031 expectContainment(set, pat, charsIn, charsOut); 2032 } 2033 2034 void 2035 UnicodeSetTest::expectContainment(const UnicodeSet& set, 2036 const UnicodeString& charsIn, 2037 const UnicodeString& charsOut) { 2038 UnicodeString pat; 2039 set.toPattern(pat); 2040 expectContainment(set, pat, charsIn, charsOut); 2041 } 2042 2043 void 2044 UnicodeSetTest::expectContainment(const UnicodeSet& set, 2045 const UnicodeString& setName, 2046 const UnicodeString& charsIn, 2047 const UnicodeString& charsOut) { 2048 UnicodeString bad; 2049 UChar32 c; 2050 int32_t i; 2051 2052 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 2053 c = charsIn.char32At(i); 2054 if (!set.contains(c)) { 2055 bad.append(c); 2056 } 2057 } 2058 if (bad.length() > 0) { 2059 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 2060 ", expected containment of " + prettify(charsIn)); 2061 } else { 2062 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 2063 } 2064 2065 bad.truncate(0); 2066 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 2067 c = charsOut.char32At(i); 2068 if (set.contains(c)) { 2069 bad.append(c); 2070 } 2071 } 2072 if (bad.length() > 0) { 2073 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 2074 ", expected non-containment of " + prettify(charsOut)); 2075 } else { 2076 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 2077 } 2078 } 2079 2080 void 2081 UnicodeSetTest::expectPattern(UnicodeSet& set, 2082 const UnicodeString& pattern, 2083 const 
UnicodeString& expectedPairs){ 2084 UErrorCode status = U_ZERO_ERROR; 2085 set.applyPattern(pattern, status); 2086 if (U_FAILURE(status)) { 2087 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2088 "\") failed"); 2089 return; 2090 } else { 2091 if (getPairs(set) != expectedPairs ) { 2092 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2093 "\") => pairs \"" + 2094 escape(getPairs(set)) + "\", expected \"" + 2095 escape(expectedPairs) + "\""); 2096 } else { 2097 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 2098 "\") => pairs \"" + 2099 escape(getPairs(set)) + "\""); 2100 } 2101 } 2102 // the result of calling set.toPattern(), which is the string representation of 2103 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2104 // will produce another set that is equal to this one. 2105 UnicodeString temppattern; 2106 set.toPattern(temppattern); 2107 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2108 if (U_FAILURE(status)) { 2109 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2110 return; 2111 } 2112 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2113 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2114 escape(getPairs(set)) + "\"")); 2115 } else{ 2116 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2117 } 2118 2119 delete tempset; 2120 2121 } 2122 2123 void 2124 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2125 if (getPairs(set) != expectedPairs) { 2126 errln(UnicodeString("FAIL: Expected pair list \"") + 2127 escape(expectedPairs) + "\", got \"" + 2128 escape(getPairs(set)) + "\""); 2129 } 2130 } 2131 2132 void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2133 
const UnicodeString& expPat, 2134 const char** expStrings) { 2135 UnicodeString pat; 2136 set.toPattern(pat, TRUE); 2137 if (pat == expPat) { 2138 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2139 } else { 2140 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2141 return; 2142 } 2143 if (expStrings == NULL) { 2144 return; 2145 } 2146 UBool in = TRUE; 2147 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2148 if (expStrings[i] == NOT) { // sic; pointer comparison 2149 in = FALSE; 2150 continue; 2151 } 2152 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2153 UBool contained = set.contains(s); 2154 if (contained == in) { 2155 logln((UnicodeString)"Ok: " + expPat + 2156 (contained ? " contains {" : " does not contain {") + 2157 escape(expStrings[i]) + "}"); 2158 } else { 2159 errln((UnicodeString)"FAIL: " + expPat + 2160 (contained ? " contains {" : " does not contain {") + 2161 escape(expStrings[i]) + "}"); 2162 } 2163 } 2164 } 2165 2166 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 
0x30 : (0x41 - 10))); } 2167 2168 void 2169 UnicodeSetTest::doAssert(UBool condition, const char *message) 2170 { 2171 if (!condition) { 2172 errln(UnicodeString("ERROR : ") + message); 2173 } 2174 } 2175 2176 UnicodeString 2177 UnicodeSetTest::escape(const UnicodeString& s) { 2178 UnicodeString buf; 2179 for (int32_t i=0; i<s.length(); ) 2180 { 2181 UChar32 c = s.char32At(i); 2182 if (0x0020 <= c && c <= 0x007F) { 2183 buf += c; 2184 } else { 2185 if (c <= 0xFFFF) { 2186 buf += (UChar)0x5c; buf += (UChar)0x75; 2187 } else { 2188 buf += (UChar)0x5c; buf += (UChar)0x55; 2189 buf += toHexString((c & 0xF0000000) >> 28); 2190 buf += toHexString((c & 0x0F000000) >> 24); 2191 buf += toHexString((c & 0x00F00000) >> 20); 2192 buf += toHexString((c & 0x000F0000) >> 16); 2193 } 2194 buf += toHexString((c & 0xF000) >> 12); 2195 buf += toHexString((c & 0x0F00) >> 8); 2196 buf += toHexString((c & 0x00F0) >> 4); 2197 buf += toHexString(c & 0x000F); 2198 } 2199 i += U16_LENGTH(c); 2200 } 2201 return buf; 2202 } 2203 2204 void UnicodeSetTest::TestFreezable() { 2205 UErrorCode errorCode=U_ZERO_ERROR; 2206 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15); 2207 UnicodeSet idSet(idPattern, errorCode); 2208 if(U_FAILURE(errorCode)) { 2209 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode)); 2210 return; 2211 } 2212 2213 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15); 2214 UnicodeSet wsSet(wsPattern, errorCode); 2215 if(U_FAILURE(errorCode)) { 2216 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode)); 2217 return; 2218 } 2219 2220 idSet.add(idPattern); 2221 UnicodeSet frozen(idSet); 2222 frozen.freeze(); 2223 2224 if(idSet.isFrozen() || !frozen.isFrozen()) { 2225 errln("FAIL: isFrozen() is wrong"); 2226 } 2227 if(frozen!=idSet || !(frozen==idSet)) { 2228 errln("FAIL: a copy-constructed frozen set differs from its original"); 2229 } 2230 2231 frozen=wsSet; 2232 if(frozen!=idSet || 
!(frozen==idSet)) { 2233 errln("FAIL: a frozen set was modified by operator="); 2234 } 2235 2236 UnicodeSet frozen2(frozen); 2237 if(frozen2!=frozen || frozen2!=idSet) { 2238 errln("FAIL: a copied frozen set differs from its frozen original"); 2239 } 2240 if(!frozen2.isFrozen()) { 2241 errln("FAIL: copy-constructing a frozen set results in a thawed one"); 2242 } 2243 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction. 2244 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) { 2245 errln("FAIL: UnicodeSet(5, 55) failed"); 2246 } 2247 frozen3=frozen; 2248 if(!frozen3.isFrozen()) { 2249 errln("FAIL: copying a frozen set results in a thawed one"); 2250 } 2251 2252 UnicodeSet *cloned=(UnicodeSet *)frozen.clone(); 2253 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) { 2254 errln("FAIL: clone() failed"); 2255 } 2256 cloned->add(0xd802, 0xd805); 2257 if(cloned->containsSome(0xd802, 0xd805)) { 2258 errln("FAIL: unable to modify clone"); 2259 } 2260 delete cloned; 2261 2262 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed(); 2263 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) { 2264 errln("FAIL: cloneAsThawed() failed"); 2265 } 2266 thawed->add(0xd802, 0xd805); 2267 if(!thawed->contains(0xd802, 0xd805)) { 2268 errln("FAIL: unable to modify thawed clone"); 2269 } 2270 delete thawed; 2271 2272 frozen.set(5, 55); 2273 if(frozen!=idSet || !(frozen==idSet)) { 2274 errln("FAIL: UnicodeSet::set() modified a frozen set"); 2275 } 2276 2277 frozen.clear(); 2278 if(frozen!=idSet || !(frozen==idSet)) { 2279 errln("FAIL: UnicodeSet::clear() modified a frozen set"); 2280 } 2281 2282 frozen.closeOver(USET_CASE_INSENSITIVE); 2283 if(frozen!=idSet || !(frozen==idSet)) { 2284 errln("FAIL: UnicodeSet::closeOver() modified a frozen set"); 2285 } 2286 2287 frozen.compact(); 2288 if(frozen!=idSet || !(frozen==idSet)) { 2289 
errln("FAIL: UnicodeSet::compact() modified a frozen set"); 2290 } 2291 2292 ParsePosition pos; 2293 frozen. 2294 applyPattern(wsPattern, errorCode). 2295 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode). 2296 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode). 2297 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode). 2298 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode); 2299 if(frozen!=idSet || !(frozen==idSet)) { 2300 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set"); 2301 } 2302 2303 frozen. 2304 add(0xd800). 2305 add(0xd802, 0xd805). 2306 add(wsPattern). 2307 addAll(idPattern). 2308 addAll(wsSet); 2309 if(frozen!=idSet || !(frozen==idSet)) { 2310 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set"); 2311 } 2312 2313 frozen. 2314 retain(0x62). 2315 retain(0x64, 0x69). 2316 retainAll(wsPattern). 2317 retainAll(wsSet); 2318 if(frozen!=idSet || !(frozen==idSet)) { 2319 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set"); 2320 } 2321 2322 frozen. 2323 remove(0x62). 2324 remove(0x64, 0x69). 2325 remove(idPattern). 2326 removeAll(idPattern). 2327 removeAll(idSet); 2328 if(frozen!=idSet || !(frozen==idSet)) { 2329 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set"); 2330 } 2331 2332 frozen. 2333 complement(). 2334 complement(0x62). 2335 complement(0x64, 0x69). 2336 complement(idPattern). 2337 complementAll(idPattern). 2338 complementAll(idSet); 2339 if(frozen!=idSet || !(frozen==idSet)) { 2340 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set"); 2341 } 2342 } 2343 2344 // Test span() etc. -------------------------------------------------------- *** 2345 2346 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 
2347 static int32_t 2348 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) { 2349 UErrorCode errorCode=U_ZERO_ERROR; 2350 int32_t length8=0; 2351 u_strToUTF8(t, capacity, &length8, s, length, &errorCode); 2352 if(U_SUCCESS(errorCode)) { 2353 return length8; 2354 } else { 2355 // The string contains an unpaired surrogate. 2356 // Ignore this string. 2357 return 0; 2358 } 2359 } 2360 2361 class UnicodeSetWithStringsIterator; 2362 2363 // Make the strings in a UnicodeSet easily accessible. 2364 class UnicodeSetWithStrings { 2365 public: 2366 UnicodeSetWithStrings(const UnicodeSet &normalSet) : 2367 set(normalSet), stringsLength(0), hasSurrogates(FALSE) { 2368 int32_t size=set.size(); 2369 if(size>0 && set.charAt(size-1)<0) { 2370 // If a set's last element is not a code point, then it must contain strings. 2371 // Iterate over the set, skip all code point ranges, and cache the strings. 2372 // Convert them to UTF-8 for spanUTF8(). 2373 UnicodeSetIterator iter(set); 2374 const UnicodeString *s; 2375 char *s8=utf8; 2376 int32_t length8, utf8Count=0; 2377 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) { 2378 if(iter.isString()) { 2379 // Store the pointer to the set's string element 2380 // which we happen to know is a stable pointer. 2381 strings[stringsLength]=s=&iter.getString(); 2382 utf8Count+= 2383 utf8Lengths[stringsLength]=length8= 2384 appendUTF8(s->getBuffer(), s->length(), 2385 s8, (int32_t)(sizeof(utf8)-utf8Count)); 2386 if(length8==0) { 2387 hasSurrogates=TRUE; // Contains unpaired surrogates. 
2388 } 2389 s8+=length8; 2390 ++stringsLength; 2391 } 2392 } 2393 } 2394 } 2395 2396 const UnicodeSet &getSet() const { 2397 return set; 2398 } 2399 2400 UBool hasStrings() const { 2401 return (UBool)(stringsLength>0); 2402 } 2403 2404 UBool hasStringsWithSurrogates() const { 2405 return hasSurrogates; 2406 } 2407 2408 private: 2409 friend class UnicodeSetWithStringsIterator; 2410 2411 const UnicodeSet &set; 2412 2413 const UnicodeString *strings[20]; 2414 int32_t stringsLength; 2415 UBool hasSurrogates; 2416 2417 char utf8[1024]; 2418 int32_t utf8Lengths[20]; 2419 }; 2420 2421 class UnicodeSetWithStringsIterator { 2422 public: 2423 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) : 2424 fSet(set), nextStringIndex(0), nextUTF8Start(0) { 2425 } 2426 2427 void reset() { 2428 nextStringIndex=nextUTF8Start=0; 2429 } 2430 2431 const UnicodeString *nextString() { 2432 if(nextStringIndex<fSet.stringsLength) { 2433 return fSet.strings[nextStringIndex++]; 2434 } else { 2435 return NULL; 2436 } 2437 } 2438 2439 // Do not mix with calls to nextString(). 2440 const char *nextUTF8(int32_t &length) { 2441 if(nextStringIndex<fSet.stringsLength) { 2442 const char *s8=fSet.utf8+nextUTF8Start; 2443 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++]; 2444 return s8; 2445 } else { 2446 length=0; 2447 return NULL; 2448 } 2449 } 2450 2451 private: 2452 const UnicodeSetWithStrings &fSet; 2453 int32_t nextStringIndex; 2454 int32_t nextUTF8Start; 2455 }; 2456 2457 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 2458 // at code point boundaries. 2459 // That is, each edge of a match must not be in the middle of a surrogate pair. 
// Returns TRUE if t occurs in s at index start and neither edge of the
// occurrence splits a lead/trail surrogate pair. limit is the end index of s.
// The caller must make sure that t fits between start and limit.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    // Rebase s and limit so that index 0 is the candidate match start.
    s+=start;
    limit-=start;
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Reference implementation for UTF-16 text: returns the limit of the initial
// span of s[0..length[ for the given span condition, using only contains()
// on code points and direct comparisons with the set's cached strings.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: walk forward until contains(c) disagrees with the condition.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where a set code point or a set string starts.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        // start: current position; next: where the iteration continues from;
        // maxSpanLimit: furthest limit found so far via recursion.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16(): returns the start index of the
// trailing span of s[0..length[ for the given span condition.
// The length parameter is reused as the moving position while iterating backward.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: walk backward until contains(c) disagrees with the condition.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position (from the end) where a set code point or a set string ends.
        // length0 keeps the original text length for the boundary check in matches16CPB().
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        // prev: current position; length: where the iteration continues from;
        // minSpanStart: smallest start found so far via recursion.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16(): same algorithm, but code points are
// read with U8_NEXT_OR_FFFD() and set strings are compared bytewise (memcmp)
// against their cached UTF-8 forms. Cached strings with UTF-8 length 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// UTF-8 counterpart of containsSpanBackUTF16(): same algorithm, but code points
// are read with U8_PREV_OR_FFFD() and set strings are compared bytewise (memcmp)
// against their cached UTF-8 forms. Cached strings with UTF-8 length 0 are skipped.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Bit flags for the whichSpans parameter; each third name in a group is the
// mask combining the two flags above it, and SPAN_ALL combines all groups.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggle between USET_SPAN_NOT_CONTAINED and the given "contained" flavor
// (USET_SPAN_CONTAINED or USET_SPAN_SIMPLE in the callers visible here).
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Length of a NUL-terminated string, interpreted as UChar[] or char[] according to isUTF16.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
 *
 * type selects the method: 0/1 contains-based forward, 2/3 span()/spanUTF8(),
 * 4/5 contains-based backward, 6/7 spanBack()/spanBackUTF8();
 * even types use USET_SPAN_CONTAINED, odd types use USET_SPAN_SIMPLE.
 * At most limitsCapacity limits are stored, but the returned count is not capped,
 * so the caller must compare the count with the capacity before reading limits[].
 */
static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
                        const void *s, int32_t length, UBool isUTF16,
                        uint32_t whichSpans,
                        int type, const char *&typeName,
                        int32_t limits[], int32_t limitsCapacity,
                        int32_t expectCount) {
    const UnicodeSet &realSet(set.getSet());
    int32_t start, count;
    USetSpanCondition spanCondition, firstSpanCondition, contained;
    UBool isForward;

    // Out-of-range type: signal "no more types" to the caller via an empty name.
    if(type<0 || 7<type) {
        typeName="";
        return 0;
    }

    static const char *const typeNames16[]={
        "contains", "contains(LM)",
        "span", "span(LM)",
        "containsBack", "containsBack(LM)",
        "spanBack", "spanBack(LM)"
    };

    static const char *const typeNames8[]={
        "containsUTF8", "containsUTF8(LM)",
        "spanUTF8", "spanUTF8(LM)",
        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
        "spanBackUTF8", "spanBackUTF8(LM)"
    };

    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

    // filter span options
    if(type<=3) {
        // span forward
        if((whichSpans&SPAN_FWD)==0) {
            return -1;
        }
        isForward=TRUE;
    } else {
        // span backward
        if((whichSpans&SPAN_BACK)==0) {
            return -1;
        }
        isForward=FALSE;
    }
    if((type&1)==0) {
        // use USET_SPAN_CONTAINED
        if((whichSpans&SPAN_CONTAINED)==0) {
            return -1;
        }
        contained=USET_SPAN_CONTAINED;
    } else {
        // use USET_SPAN_SIMPLE
        if((whichSpans&SPAN_SIMPLE)==0) {
            return -1;
        }
        contained=USET_SPAN_SIMPLE;
    }

    // Default first span condition for going forward with an uncomplemented set.
    spanCondition=USET_SPAN_NOT_CONTAINED;
    if(isComplement) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    // First span condition for span(), used to terminate the spanBack() iteration.
    firstSpanCondition=spanCondition;

    // spanBack(): Its initial span condition is span()'s last span condition,
    // which is the opposite of span()'s first span condition
    // if we expect an even number of spans.
    // (The loop inverts spanCondition (expectCount-1) times
    // before the expectCount'th span() call.)
    // If we do not compare forward and backward directions, then we do not have an
    // expectCount and just start with firstSpanCondition.
    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    count=0;
    switch(type) {
    case 0:
    case 1:
        // Forward, with the contains-based reference implementation.
        start=0;
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(start>=length) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 2:
    case 3:
        // Forward, with the real span()/spanUTF8(); a negative length is passed through
        // and the end of the text is then detected by its terminating NUL.
        start=0;
        for(;;) {
            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(length>=0 ? start>=length :
                           isUTF16 ? ((const UChar *)s)[start]==0 :
                                     ((const char *)s)[start]==0
            ) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 4:
    case 5:
        // Backward, with the contains-based reference implementation.
        // Limits are written from the end of limits[] toward the front.
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]=length;
            }
            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the front of the array (4 bytes per int32_t entry).
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    case 6:
    case 7:
        // Backward, with the real spanBack()/spanBackUTF8().
        // Limits are written from the end of limits[] toward the front.
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
            }
            // Note: Length<0 is tested only for the first spanBack().
            // If we wanted to keep length<0 for all spanBack()s, we would have to
            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the front of the array (4 bytes per int32_t entry).
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    default:
        typeName="";
        return -1;
    }

    return count;
}

// sets to be tested; odd index=isComplement
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT
};

// Display names for the sets, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};

/*
 * Verify that we get the same results whether we look at text with contains(),
 * span() or spanBack(), using unfrozen or frozen versions of the set,
 * and using the set or its complement (switching the spanConditions accordingly).
 * The latter verifies that
 *   set.span(spanCondition) == set.complement().span(!spanCondition).
 *
 * The expectLimits[] are either provided by the caller (with expectCount>=0)
 * or returned to the caller (with an input expectCount<0).
 */
void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
                              const void *s, int32_t length, UBool isUTF16,
                              uint32_t whichSpans,
                              int32_t expectLimits[], int32_t &expectCount,
                              const char *testName, int32_t index) {
    int32_t limits[500];
    int32_t limitsCount;
    int i, j;

    const char *typeName;
    int type;

    for(i=0; i<SET_COUNT; ++i) {
        if((i&1)==0) {
            // Even-numbered sets are original, uncomplemented sets.
            if((whichSpans&SPAN_SET)==0) {
                continue;
            }
        } else {
            // Odd-numbered sets are complemented.
            if((whichSpans&SPAN_COMPLEMENT)==0) {
                continue;
            }
        }
        // Try each span type in turn; getSpans() returns an empty typeName past the last one.
        for(type=0;; ++type) {
            limitsCount=getSpans(*sets[i], (UBool)(i&1),
                                 s, length, isUTF16,
                                 whichSpans,
                                 type, typeName,
                                 limits, UPRV_LENGTHOF(limits), expectCount);
            if(typeName[0]==0) {
                break; // All types tried.
            }
            if(limitsCount<0) {
                continue; // Span option filtered out.
            }
            if(expectCount<0) {
                // First result: it becomes the expectation for all later set/type combinations.
                expectCount=limitsCount;
                if(limitsCount>UPRV_LENGTHOF(limits)) {
                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
                    return;
                }
                memcpy(expectLimits, limits, limitsCount*4);
            } else if(limitsCount!=expectCount) {
                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
            } else {
                // Same count: report only the first differing limit.
                for(j=0; j<limitsCount; ++j) {
                    if(limits[j]!=expectLimits[j]) {
                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
                              j, (long)limits[j], (long)expectLimits[j]);
                        break;
                    }
                }
            }
        }
    }

    // Compare span() with containsAll()/containsNone(),
    // but only if we have expectLimits[] from the uncomplemented set.
    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
        const UChar *s16=(const UChar *)s;
        UnicodeString string;
        // Note: this local "length" shadows the function parameter of the same name.
        int32_t prev=0, limit, length;
        for(i=0; i<expectCount; ++i) {
            limit=expectLimits[i];
            length=limit-prev;
            if(length>0) {
                string.setTo(FALSE, s16+prev, length);  // read-only alias
                // Odd-numbered spans are checked with containsAll(), even-numbered ones with containsNone().
                if(i&1) {
                    if(!sets[SLOW]->getSet().containsAll(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
                        return;
                    }
                    if(!sets[FAST]->getSet().containsAll(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
                        return;
                    }
                } else {
                    if(!sets[SLOW]->getSet().containsNone(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
                        return;
                    }
                    if(!sets[FAST]->getSet().containsNone(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
                        return;
                    }
                }
            }
            prev=limit;
        }
    }
}

// Specifically test either UTF-16 or UTF-8.
3203 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3204 const void *s, int32_t length, UBool isUTF16, 3205 uint32_t whichSpans, 3206 const char *testName, int32_t index) { 3207 int32_t expectLimits[500]; 3208 int32_t expectCount=-1; 3209 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3210 } 3211 3212 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3213 UChar c, c2; 3214 3215 if(length>=0) { 3216 while(length>0) { 3217 c=*s++; 3218 --length; 3219 if(0xd800<=c && c<0xe000) { 3220 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3221 return TRUE; 3222 } 3223 --length; 3224 } 3225 } 3226 } else { 3227 while((c=*s++)!=0) { 3228 if(0xd800<=c && c<0xe000) { 3229 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3230 return TRUE; 3231 } 3232 } 3233 } 3234 } 3235 return FALSE; 3236 } 3237 3238 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3239 // unless either UTF is turned off in whichSpans. 3240 // Testing UTF-16 and UTF-8 together requires that surrogate code points 3241 // have the same contains(c) value as U+FFFD. 3242 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3243 const UChar *s16, int32_t length16, 3244 uint32_t whichSpans, 3245 const char *testName, int32_t index) { 3246 int32_t expectLimits[500]; 3247 int32_t expectCount; 3248 3249 expectCount=-1; // Get expectLimits[] from testSpan(). 3250 3251 if((whichSpans&SPAN_UTF16)!=0) { 3252 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3253 } 3254 if((whichSpans&SPAN_UTF8)==0) { 3255 return; 3256 } 3257 3258 // Convert s16[] and expectLimits[] to UTF-8. 
3259 uint8_t s8[3000]; 3260 int32_t offsets[3000]; 3261 3262 const UChar *s16Limit=s16+length16; 3263 char *t=(char *)s8; 3264 char *tLimit=t+sizeof(s8); 3265 int32_t *o=offsets; 3266 UErrorCode errorCode=U_ZERO_ERROR; 3267 3268 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3269 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3270 if(U_FAILURE(errorCode)) { 3271 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3272 testName, (long)index, u_errorName(errorCode)); 3273 ucnv_resetFromUnicode(utf8Cnv); 3274 return; 3275 } 3276 int32_t length8=(int32_t)(t-(char *)s8); 3277 3278 // Convert expectLimits[]. 3279 int32_t i, j, expect; 3280 for(i=j=0; i<expectCount; ++i) { 3281 expect=expectLimits[i]; 3282 if(expect==length16) { 3283 expectLimits[i]=length8; 3284 } else { 3285 while(offsets[j]<expect) { 3286 ++j; 3287 } 3288 expectLimits[i]=j; 3289 } 3290 } 3291 3292 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3293 } 3294 3295 static UChar32 nextCodePoint(UChar32 c) { 3296 // Skip some large and boring ranges. 3297 switch(c) { 3298 case 0x3441: 3299 return 0x4d7f; 3300 case 0x5100: 3301 return 0x9f00; 3302 case 0xb040: 3303 return 0xd780; 3304 case 0xe041: 3305 return 0xf8fe; 3306 case 0x10100: 3307 return 0x20000; 3308 case 0x20041: 3309 return 0xe0000; 3310 case 0xe0101: 3311 return 0x10fffd; 3312 default: 3313 return c+1; 3314 } 3315 } 3316 3317 // Verify that all implementations represent the same set. 3318 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3319 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3320 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3321 // Skip the UTF-8 part of the test - if the string contains surrogates - 3322 // because it is likely to produce a different result. 
3323 UBool inconsistentSurrogates= 3324 (!(sets[0]->getSet().contains(0xfffd) ? 3325 sets[0]->getSet().contains(0xd800, 0xdfff) : 3326 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3327 sets[0]->hasStringsWithSurrogates()); 3328 3329 UChar s[1000]; 3330 int32_t length=0; 3331 uint32_t localWhichSpans; 3332 3333 UChar32 c, first; 3334 for(first=c=0;; c=nextCodePoint(c)) { 3335 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) { 3336 localWhichSpans=whichSpans; 3337 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3338 localWhichSpans&=~SPAN_UTF8; 3339 } 3340 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3341 if(c>0x10ffff) { 3342 break; 3343 } 3344 length=0; 3345 first=c; 3346 } 3347 U16_APPEND_UNSAFE(s, length, c); 3348 } 3349 } 3350 3351 // Test with a particular, interesting string. 3352 // Specify length and try NUL-termination. 3353 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3354 static const UChar s[]={ 3355 0x61, 0x62, 0x20, // Latin, space 3356 0x3b1, 0x3b2, 0x3b3, // Greek 3357 0xd900, // lead surrogate 3358 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3359 0xdc05, // trail surrogate 3360 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3361 0xd900, 0xdc05, // unassigned supplementary 3362 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3363 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3364 0 // NUL 3365 }; 3366 3367 if((whichSpans&SPAN_UTF16)==0) { 3368 return; 3369 } 3370 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3371 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3372 } 3373 3374 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3375 static const char s[]={ 3376 "abc" // Latin 3377 3378 /* trail byte in lead position */ 3379 "\x80" 3380 3381 " " // 
space 3382 3383 /* truncated multi-byte sequences */ 3384 "\xd0" 3385 "\xe0" 3386 "\xe1" 3387 "\xed" 3388 "\xee" 3389 "\xf0" 3390 "\xf1" 3391 "\xf4" 3392 "\xf8" 3393 "\xfc" 3394 3395 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3396 3397 /* trail byte in lead position */ 3398 "\x80" 3399 3400 "\xe0\x80" 3401 "\xe0\xa0" 3402 "\xe1\x80" 3403 "\xed\x80" 3404 "\xed\xa0" 3405 "\xee\x80" 3406 "\xf0\x80" 3407 "\xf0\x90" 3408 "\xf1\x80" 3409 "\xf4\x80" 3410 "\xf4\x90" 3411 "\xf8\x80" 3412 "\xfc\x80" 3413 3414 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3415 3416 /* trail byte in lead position */ 3417 "\x80" 3418 3419 "\xf0\x80\x80" 3420 "\xf0\x90\x80" 3421 "\xf1\x80\x80" 3422 "\xf4\x80\x80" 3423 "\xf4\x90\x80" 3424 "\xf8\x80\x80" 3425 "\xfc\x80\x80" 3426 3427 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3428 3429 /* trail byte in lead position */ 3430 "\x80" 3431 3432 "\xf8\x80\x80\x80" 3433 "\xfc\x80\x80\x80" 3434 3435 "\xF1\x90\x80\x85" // unassigned supplementary 3436 3437 /* trail byte in lead position */ 3438 "\x80" 3439 3440 "\xfc\x80\x80\x80\x80" 3441 3442 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3443 3444 /* trail byte in lead position */ 3445 "\x80" 3446 3447 /* complete sequences but non-shortest forms or out of range etc. */ 3448 "\xc0\x80" 3449 "\xe0\x80\x80" 3450 "\xed\xa0\x80" 3451 "\xf0\x80\x80\x80" 3452 "\xf4\x90\x80\x80" 3453 "\xf8\x80\x80\x80\x80" 3454 "\xfc\x80\x80\x80\x80\x80" 3455 "\xfe" 3456 "\xff" 3457 3458 /* trail byte in lead position */ 3459 "\x80" 3460 3461 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3462 }; 3463 3464 if((whichSpans&SPAN_UTF8)==0) { 3465 return; 3466 } 3467 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3468 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3469 } 3470 3471 // Take a set of span options and multiply them so that 3472 // each portion only has one of the options a, b and c. 
3473 // If b==0, then the set of options is just modified with mask and a. 3474 // If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3475 static int32_t 3476 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3477 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3478 uint32_t s; 3479 int32_t i; 3480 3481 for(i=0; i<whichSpansCount; ++i) { 3482 s=whichSpans[i]&mask; 3483 whichSpans[i]=s|a; 3484 if(b!=0) { 3485 whichSpans[whichSpansCount+i]=s|b; 3486 if(c!=0) { 3487 whichSpans[2*whichSpansCount+i]=s|c; 3488 } 3489 } 3490 } 3491 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3492 } 3493 3494 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3495 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3496 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3497 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3498 3499 void UnicodeSetTest::TestSpan() { 3500 // "[...]" is a UnicodeSet pattern. 3501 // "*" performs tests on all Unicode code points and on a selection of 3502 // malformed UTF-8/16 strings. 3503 // "-options" limits the scope of testing for the current set. 3504 // By default, the test verifies that equivalent boundaries are found 3505 // for UTF-16 and UTF-8, going forward and backward, 3506 // alternating USET_SPAN_NOT_CONTAINED with 3507 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3508 // Single-character options: 3509 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3510 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3511 // or the set contains strings with unpaired surrogates 3512 // which do not translate to valid UTF-8. 3513 // c -- set.span() and set.complement().span() boundaries may differ. 3514 // Cause: Set strings are not complemented. 3515 // b -- span() and spanBack() boundaries may differ. 
3516 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3517 // and spanBack(USET_SPAN_SIMPLE) are defined to 3518 // match with non-overlapping substrings. 3519 // For example, with a set containing "ab" and "ba", 3520 // span() of "aba" yields boundaries { 0, 2, 3 } 3521 // because the initial "ab" matches from 0 to 2, 3522 // while spanBack() yields boundaries { 0, 1, 3 } 3523 // because the final "ba" matches from 1 to 3. 3524 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3525 // Cause: Strings in the set overlap, and a longer match may 3526 // require a sequence including non-longest substrings. 3527 // For example, with a set containing "ab", "abc" and "cd", 3528 // span(contained) of "abcd" spans the entire string 3529 // but span(longest match) only spans the first 3 characters. 3530 // Each "-options" first resets all options and then applies the specified options. 3531 // A "-" without options resets the options. 3532 // The options are also reset for each new set. 3533 // Other strings will be spanned. 3534 static const char *const testdata[]={ 3535 "[:ID_Continue:]", 3536 "*", 3537 "[:White_Space:]", 3538 "*", 3539 "[]", 3540 "*", 3541 "[\\u0000-\\U0010FFFF]", 3542 "*", 3543 "[\\u0000\\u0080\\u0800\\U00010000]", 3544 "*", 3545 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3546 "*", 3547 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3548 "-c", 3549 "*", 3550 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3551 "-c", 3552 "*", 3553 3554 // Overlapping strings cause overlapping attempts to match. 3555 "[x{xy}{xya}{axy}{ax}]", 3556 "-cl", 3557 3558 // More repetitions of "xya" would take too long with the recursive 3559 // reference implementation. 3560 // containsAll()=FALSE 3561 // test_string 0x14 3562 "xx" 3563 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3564 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3565 "xyaxyaxyaxya" 3566 "xx" 3567 "xyaxyaxyaxya" // span() ends here. 3568 "aaa", 3569 3570 // containsAll()=TRUE 3571 // test_string 0x15 3572 "xx" 3573 "xyaxyaxyaxya" 3574 "xx" 3575 "xyaxyaxyaxya" 3576 "xx" 3577 "xyaxyaxyaxy", 3578 3579 "-bc", 3580 // test_string 0x17 3581 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3582 "-c", 3583 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3584 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3585 "-", 3586 "byaya", // span() -> { 5 } 3587 "byay", // span() -> { 4 } 3588 "bya", // span() -> { 3 } 3589 3590 // span(longest match) will not span the whole string. 3591 "[a{ab}{bc}]", 3592 "-cl", 3593 // test_string 0x21 3594 "abc", 3595 3596 "[a{ab}{abc}{cd}]", 3597 "-cl", 3598 "acdabcdabccd", 3599 3600 // spanBack(longest match) will not span the whole string. 3601 "[c{ab}{bc}]", 3602 "-cl", 3603 "abc", 3604 3605 "[d{cd}{bcd}{ab}]", 3606 "-cl", 3607 "abbcdabcdabd", 3608 3609 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3610 // and UTF-8 trail bytes. 3611 // Copies of above test sets and strings, but transliterated to have 3612 // different code points with similar trail units. 3613 // Previous: a b c d 3614 // Unicode: 042B 30AB 200AB 204AB 3615 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3616 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3617 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3618 "-cl", 3619 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3620 3621 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3622 "-cl", 3623 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3624 3625 // Stress bookkeeping and recursion. 
3626 // The following strings are barely doable with the recursive 3627 // reference implementation. 3628 // The not-contained character at the end prevents an early exit from the span(). 3629 "[b{bb}]", 3630 "-c", 3631 // test_string 0x33 3632 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3633 // On complement sets, span() and spanBack() get different results 3634 // because b is not in the complement set and there is an odd number of b's 3635 // in the test string. 3636 "-bc", 3637 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3638 3639 // Test with set strings with an initial or final code point span 3640 // longer than 254. 3641 "[a{" _64_a _64_a _64_a _64_a "b}" 3642 "{a" _64_b _64_b _64_b _64_b "}]", 3643 "-c", 3644 _64_a _64_a _64_a _63_a "b", 3645 _64_a _64_a _64_a _64_a "b", 3646 _64_a _64_a _64_a _64_a "aaaabbbb", 3647 "a" _64_b _64_b _64_b _63_b, 3648 "a" _64_b _64_b _64_b _64_b, 3649 "aaaabbbb" _64_b _64_b _64_b _64_b, 3650 3651 // Test with strings containing unpaired surrogates. 3652 // They are not representable in UTF-8, and a leading trail surrogate 3653 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3654 // U+20001 == \\uD840\\uDC01 3655 // U+20400 == \\uD841\\uDC00 3656 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3657 "-8cl", 3658 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3659 }; 3660 uint32_t whichSpans[96]={ SPAN_ALL }; 3661 int32_t whichSpansCount=1; 3662 3663 UnicodeSet *sets[SET_COUNT]={ NULL }; 3664 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3665 3666 char testName[1024]; 3667 char *testNameLimit=testName; 3668 3669 int32_t i, j; 3670 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) { 3671 const char *s=testdata[i]; 3672 if(s[0]=='[') { 3673 // Create new test sets from this pattern. 
3674 for(j=0; j<SET_COUNT; ++j) { 3675 delete sets_with_str[j]; 3676 delete sets[j]; 3677 } 3678 UErrorCode errorCode=U_ZERO_ERROR; 3679 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3680 if(U_FAILURE(errorCode)) { 3681 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3682 break; 3683 } 3684 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3685 sets[SLOW_NOT]->complement(); 3686 // Intermediate set: Test cloning of a frozen set. 3687 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3688 fast->freeze(); 3689 sets[FAST]=(UnicodeSet *)fast->clone(); 3690 delete fast; 3691 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3692 fastNot->freeze(); 3693 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3694 delete fastNot; 3695 3696 for(j=0; j<SET_COUNT; ++j) { 3697 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3698 } 3699 3700 strcpy(testName, s); 3701 testNameLimit=strchr(testName, 0); 3702 *testNameLimit++=':'; 3703 *testNameLimit=0; 3704 3705 whichSpans[0]=SPAN_ALL; 3706 whichSpansCount=1; 3707 } else if(s[0]=='-') { 3708 whichSpans[0]=SPAN_ALL; 3709 whichSpansCount=1; 3710 3711 while(*++s!=0) { 3712 switch(*s) { 3713 case 'c': 3714 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3715 ~SPAN_POLARITY, 3716 SPAN_SET, 3717 SPAN_COMPLEMENT, 3718 0); 3719 break; 3720 case 'b': 3721 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3722 ~SPAN_DIRS, 3723 SPAN_FWD, 3724 SPAN_BACK, 3725 0); 3726 break; 3727 case 'l': 3728 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3729 // USET_SPAN_SIMPLE only FWD, and separately 3730 // USET_SPAN_SIMPLE only BACK 3731 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3732 ~(SPAN_DIRS|SPAN_CONDITION), 3733 SPAN_DIRS|SPAN_CONTAINED, 3734 SPAN_FWD|SPAN_SIMPLE, 3735 SPAN_BACK|SPAN_SIMPLE); 3736 break; 3737 case '8': 3738 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3739 ~SPAN_UTFS, 3740 SPAN_UTF16, 3741 SPAN_UTF8, 3742 
0); 3743 break; 3744 default: 3745 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3746 break; 3747 } 3748 } 3749 } else if(0==strcmp(s, "*")) { 3750 strcpy(testNameLimit, "bad_string"); 3751 for(j=0; j<whichSpansCount; ++j) { 3752 if(whichSpansCount>1) { 3753 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3754 "%%0x%3x", 3755 whichSpans[j]); 3756 } 3757 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3758 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3759 } 3760 3761 strcpy(testNameLimit, "contents"); 3762 for(j=0; j<whichSpansCount; ++j) { 3763 if(whichSpansCount>1) { 3764 sprintf(testNameLimit+8 /* strlen("contents") */, 3765 "%%0x%3x", 3766 whichSpans[j]); 3767 } 3768 testSpanContents(sets_with_str, whichSpans[j], testName); 3769 } 3770 } else { 3771 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3772 strcpy(testNameLimit, "test_string"); 3773 for(j=0; j<whichSpansCount; ++j) { 3774 if(whichSpansCount>1) { 3775 sprintf(testNameLimit+11 /* strlen("test_string") */, 3776 "%%0x%3x", 3777 whichSpans[j]); 3778 } 3779 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3780 } 3781 } 3782 } 3783 for(j=0; j<SET_COUNT; ++j) { 3784 delete sets_with_str[j]; 3785 delete sets[j]; 3786 } 3787 } 3788 3789 // Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3790 void UnicodeSetTest::TestStringSpan() { 3791 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3792 static const char *const string= 3793 "xx" 3794 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3795 "xx" 3796 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3797 "xx" 3798 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3799 "aaaa"; 3800 3801 UErrorCode errorCode=U_ZERO_ERROR; 3802 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3803 UnicodeSet set(pattern16, errorCode); 3804 if(U_FAILURE(errorCode)) { 3805 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3806 return; 3807 } 3808 3809 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3810 3811 if(set.containsAll(string16)) { 3812 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3813 } 3814 3815 // Remove trailing "aaaa". 3816 string16.truncate(string16.length()-4); 3817 if(!set.containsAll(string16)) { 3818 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3819 } 3820 3821 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3822 const UChar *s16=string16.getBuffer(); 3823 int32_t length16=string16.length(); 3824 (void)length16; // Suppress set but not used warning. 
3825 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3826 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3827 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3828 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3829 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3830 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3831 ) { 3832 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3833 } 3834 3835 pattern="[a{ab}{abc}{cd}]"; 3836 pattern16=UnicodeString(pattern, -1, US_INV); 3837 set.applyPattern(pattern16, errorCode); 3838 if(U_FAILURE(errorCode)) { 3839 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3840 return; 3841 } 3842 string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3843 s16=string16.getBuffer(); 3844 length16=string16.length(); 3845 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3846 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3847 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3848 ) { 3849 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3850 } 3851 3852 pattern="[d{cd}{bcd}{ab}]"; 3853 pattern16=UnicodeString(pattern, -1, US_INV); 3854 set.applyPattern(pattern16, errorCode).freeze(); 3855 if(U_FAILURE(errorCode)) { 3856 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3857 return; 3858 } 3859 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3860 s16=string16.getBuffer(); 3861 length16=string16.length(); 3862 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3863 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3864 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3865 ) { 3866 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3867 } 3868 } 3869 3870 /** 3871 * Including collationroot.h fails here with 3872 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition 3873 * .. so, we skip this test on Windows. 
3874 * 3875 * the cause is that intltest builds with /Za which disables language extensions - which means 3876 * windows header files can't be used. 3877 */ 3878 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API 3879 #include "collationroot.h" 3880 #include "collationtailoring.h" 3881 #endif 3882 3883 void UnicodeSetTest::TestUCAUnsafeBackwards() { 3884 #if U_PLATFORM_HAS_WIN32_API 3885 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!"); 3886 #elif !UCONFIG_NO_COLLATION 3887 UErrorCode errorCode = U_ZERO_ERROR; 3888 3889 // Get the unsafeBackwardsSet 3890 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode); 3891 if(U_FAILURE(errorCode)) { 3892 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode)); 3893 return; 3894 } 3895 //const UVersionInfo &version = rootEntry->tailoring->version; 3896 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet; 3897 3898 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode); 3899 3900 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) { 3901 // simple test case 3902 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately. 3903 // TODO(ticket #11891): Port test to Java. Is this a bug there, too? 3904 UnicodeSet surrogates; 3905 surrogates.add(0xd83a); // a lead surrogate 3906 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates 3907 UnicodeString pat; 3908 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ] 3909 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat() 3910 // so that at least one type of surrogate code points are escaped, 3911 // or (minimally) so that adjacent lead+trail surrogate code points are escaped. 
3912 errorCode = U_ZERO_ERROR; 3913 UnicodeSet s2; 3914 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ] 3915 if(U_FAILURE(errorCode)) { 3916 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode)); 3917 } else { 3918 checkEqual(surrogates, s2, "surrogates to/from pattern"); 3919 } 3920 // This occurs in the UCA unsafe-backwards set. 3921 checkRoundTrip(*unsafeBackwardSet); 3922 } 3923 #endif 3924 } 3925