1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 #include "unicode/uset.h" 10 #include "unicode/ustring.h" 11 #include "cintltst.h" 12 #include "cmemory.h" 13 #include <stdlib.h> 14 #include <string.h> 15 16 #define TEST(x) addTest(root, &x, "uset/" # x) 17 18 static void TestAPI(void); 19 static void Testj2269(void); 20 static void TestSerialized(void); 21 static void TestNonInvariantPattern(void); 22 static void TestBadPattern(void); 23 static void TestFreezable(void); 24 static void TestSpan(void); 25 26 void addUSetTest(TestNode** root); 27 28 static void expect(const USet* set, 29 const char* inList, 30 const char* outList, 31 UErrorCode* ec); 32 static void expectContainment(const USet* set, 33 const char* list, 34 UBool isIn); 35 static char oneUCharToChar(UChar32 c); 36 static void expectItems(const USet* set, 37 const char* items); 38 39 void 40 addUSetTest(TestNode** root) { 41 TEST(TestAPI); 42 TEST(Testj2269); 43 TEST(TestSerialized); 44 TEST(TestNonInvariantPattern); 45 TEST(TestBadPattern); 46 TEST(TestFreezable); 47 TEST(TestSpan); 48 } 49 50 /*------------------------------------------------------------------ 51 * Tests 52 *------------------------------------------------------------------*/ 53 54 static void Testj2269() { 55 UErrorCode status = U_ZERO_ERROR; 56 UChar a[4] = { 0x61, 0x62, 0x63, 0 }; 57 USet *s = uset_open(1, 0); 58 uset_addString(s, a, 3); 59 a[0] = 0x63; a[1] = 0x63; 60 expect(s, "{abc}", "{ccc}", &status); 61 uset_close(s); 62 } 63 64 static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */ 65 static const int32_t PAT_LEN = UPRV_LENGTHOF(PAT) - 1; 66 67 static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */ 68 static const int32_t PAT_lb_LEN = UPRV_LENGTHOF(PAT_lb) - 1; 69 70 static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */ 71 static const int32_t VAL_SP_LEN = UPRV_LENGTHOF(VAL_SP) - 1; 72 73 static const UChar STR_bc[] = {98,99,0}; /* "bc" */ 74 static const int32_t STR_bc_LEN = UPRV_LENGTHOF(STR_bc) - 1; 75 76 static const UChar STR_ab[] = {97,98,0}; /* "ab" */ 77 static const int32_t STR_ab_LEN = UPRV_LENGTHOF(STR_ab) - 1; 78 79 /** 80 * Basic API test for uset.x 81 */ 82 static void TestAPI() { 83 USet* set; 84 USet* set2; 85 UErrorCode ec; 86 87 /* [] */ 88 set = uset_openEmpty(); 89 expect(set, "", "abc{ab}", NULL); 90 uset_close(set); 91 92 set = uset_open(1, 0); 93 expect(set, "", "abc{ab}", NULL); 94 uset_close(set); 95 96 set = uset_open(1, 1); 97 uset_clear(set); 98 expect(set, "", "abc{ab}", NULL); 99 uset_close(set); 100 101 /* [ABC] */ 102 set = uset_open(0x0041, 0x0043); 103 expect(set, "ABC", "DEF{ab}", NULL); 104 uset_close(set); 105 106 /* [a-c{ab}] */ 107 ec = U_ZERO_ERROR; 108 set = uset_openPattern(PAT, PAT_LEN, &ec); 109 if(U_FAILURE(ec)) { 110 log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); 111 return; 112 } 113 if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { 114 log_err("uset_resemblesPattern of PAT failed\n"); 115 } 116 expect(set, "abc{ab}", "def{bc}", &ec); 117 118 /* [a-d{ab}] */ 119 uset_add(set, 0x64); 120 expect(set, "abcd{ab}", "ef{bc}", NULL); 121 122 /* [acd{ab}{bc}] */ 123 uset_remove(set, 0x62); 124 uset_addString(set, STR_bc, STR_bc_LEN); 125 expect(set, "acd{ab}{bc}", "bef{cd}", NULL); 126 127 /* [acd{bc}] */ 128 uset_removeString(set, STR_ab, STR_ab_LEN); 129 expect(set, "acd{bc}", "bfg{ab}", NULL); 130 131 /* [^acd{bc}] */ 132 uset_complement(set); 133 expect(set, "bef{bc}", "acd{ac}", NULL); 134 135 /* [a-e{bc}] */ 136 uset_complement(set); 137 uset_addRange(set, 0x0062, 0x0065); 138 expect(set, "abcde{bc}", "fg{ab}", NULL); 139 140 /* [de{bc}] */ 141 uset_removeRange(set, 0x0050, 0x0063); 142 expect(set, "de{bc}", "bcfg{ab}", NULL); 143 144 /* [g-l] */ 145 uset_set(set, 0x0067, 0x006C); 146 expect(set, "ghijkl", "de{bc}", NULL); 147 148 if (uset_indexOf(set, 0x0067) != 0) { 149 log_err("uset_indexOf failed finding correct index of 'g'\n"); 150 } 151 152 if (uset_charAt(set, 0) != 0x0067) { 153 log_err("uset_charAt failed finding correct char 'g' at index 0\n"); 154 } 155 156 /* How to test this one...? */ 157 uset_compact(set); 158 159 /* [g-i] */ 160 uset_retain(set, 0x0067, 0x0069); 161 expect(set, "ghi", "dejkl{bc}", NULL); 162 163 /* UCHAR_ASCII_HEX_DIGIT */ 164 uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); 165 if(U_FAILURE(ec)) { 166 log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); 167 return; 168 } 169 expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); 170 171 /* [ab] */ 172 uset_clear(set); 173 uset_addAllCodePoints(set, STR_ab, STR_ab_LEN); 174 expect(set, "ab", "def{ab}", NULL); 175 if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){ 176 log_err("set should not conatin all characters of \"bc\" \n"); 177 } 178 179 /* [] */ 180 set2 = uset_open(1, 1); 181 uset_clear(set2); 182 183 /* space */ 184 uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); 185 expect(set2, " ", "abcdefghi{bc}", NULL); 186 187 /* [a-c] */ 188 uset_set(set2, 0x0061, 0x0063); 189 /* [g-i] */ 190 uset_set(set, 0x0067, 0x0069); 191 192 /* [a-c g-i] */ 193 if (uset_containsSome(set, set2)) { 194 log_err("set should not contain some of set2 yet\n"); 195 } 196 uset_complementAll(set, set2); 197 if (!uset_containsSome(set, set2)) { 198 log_err("set should contain some of set2\n"); 199 } 200 expect(set, "abcghi", "def{bc}", NULL); 201 202 /* [g-i] */ 203 uset_removeAll(set, set2); 204 expect(set, "ghi", "abcdef{bc}", NULL); 205 206 /* [a-c g-i] */ 207 uset_addAll(set2, set); 208 expect(set2, "abcghi", "def{bc}", NULL); 209 210 /* [g-i] */ 211 uset_retainAll(set2, set); 212 expect(set2, "ghi", "abcdef{bc}", NULL); 213 214 uset_close(set); 215 uset_close(set2); 216 } 217 218 /*------------------------------------------------------------------ 219 * Support 220 *------------------------------------------------------------------*/ 221 222 /** 223 * Verifies that the given set contains the characters and strings in 224 * inList, and does not contain those in outList. Also verifies that 225 * 'set' is not NULL and that 'ec' succeeds. 226 * @param set the set to test, or NULL (on error) 227 * @param inList list of set contents, in iteration order. Format is 228 * list of individual strings, in iteration order, followed by sorted 229 * list of strings, delimited by {}. This means we do not test 230 * characters '{' or '}' and we do not test strings containing those 231 * characters either. 232 * @param outList list of things not in the set. Same format as 233 * inList. 234 * @param ec an error code, checked for success. May be NULL in which 235 * case it is ignored. 236 */ 237 static void expect(const USet* set, 238 const char* inList, 239 const char* outList, 240 UErrorCode* ec) { 241 if (ec!=NULL && U_FAILURE(*ec)) { 242 log_err("FAIL: %s\n", u_errorName(*ec)); 243 return; 244 } 245 if (set == NULL) { 246 log_err("FAIL: USet is NULL\n"); 247 return; 248 } 249 expectContainment(set, inList, TRUE); 250 expectContainment(set, outList, FALSE); 251 expectItems(set, inList); 252 } 253 254 static void expectContainment(const USet* set, 255 const char* list, 256 UBool isIn) { 257 const char* p = list; 258 UChar ustr[4096]; 259 char *pat; 260 UErrorCode ec; 261 int32_t rangeStart = -1, rangeEnd = -1, length; 262 263 ec = U_ZERO_ERROR; 264 length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec); 265 if(U_FAILURE(ec)) { 266 log_err("FAIL: uset_toPattern() fails in expectContainment() - %s\n", u_errorName(ec)); 267 return; 268 } 269 pat=aescstrdup(ustr, length); 270 271 while (*p) { 272 if (*p=='{') { 273 const char* stringStart = ++p; 274 int32_t stringLength = 0; 275 char strCopy[64]; 276 277 while (*p++ != '}') { 278 } 279 stringLength = (int32_t)(p - stringStart - 1); 280 strncpy(strCopy, stringStart, stringLength); 281 strCopy[stringLength] = 0; 282 283 u_charsToUChars(stringStart, ustr, stringLength); 284 285 if (uset_containsString(set, ustr, stringLength) == isIn) { 286 log_verbose("Ok: %s %s \"%s\"\n", pat, 287 (isIn ? "contains" : "does not contain"), 288 strCopy); 289 } else { 290 log_data_err("FAIL: %s %s \"%s\" (Are you missing data?)\n", pat, 291 (isIn ? "does not contain" : "contains"), 292 strCopy); 293 } 294 } 295 296 else { 297 UChar32 c; 298 299 u_charsToUChars(p, ustr, 1); 300 c = ustr[0]; 301 302 if (uset_contains(set, c) == isIn) { 303 log_verbose("Ok: %s %s '%c'\n", pat, 304 (isIn ? "contains" : "does not contain"), 305 *p); 306 } else { 307 log_data_err("FAIL: %s %s '%c' (Are you missing data?)\n", pat, 308 (isIn ? "does not contain" : "contains"), 309 *p); 310 } 311 312 /* Test the range API too by looking for ranges */ 313 if (c == rangeEnd+1) { 314 rangeEnd = c; 315 } else { 316 if (rangeStart >= 0) { 317 if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) { 318 log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat, 319 (isIn ? "contains" : "does not contain"), 320 rangeStart, rangeEnd); 321 } else { 322 log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat, 323 (isIn ? "does not contain" : "contains"), 324 rangeStart, rangeEnd); 325 } 326 } 327 rangeStart = rangeEnd = c; 328 } 329 330 ++p; 331 } 332 } 333 334 if (rangeStart >= 0) { 335 if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) { 336 log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat, 337 (isIn ? "contains" : "does not contain"), 338 rangeStart, rangeEnd); 339 } else { 340 log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat, 341 (isIn ? "does not contain" : "contains"), 342 rangeStart, rangeEnd); 343 } 344 } 345 } 346 347 /* This only works for invariant BMP chars */ 348 static char oneUCharToChar(UChar32 c) { 349 UChar ubuf[1]; 350 char buf[1]; 351 ubuf[0] = (UChar) c; 352 u_UCharsToChars(ubuf, buf, 1); 353 return buf[0]; 354 } 355 356 static void expectItems(const USet* set, 357 const char* items) { 358 const char* p = items; 359 UChar ustr[4096], itemStr[4096]; 360 char buf[4096]; 361 char *pat; 362 UErrorCode ec; 363 int32_t expectedSize = 0; 364 int32_t itemCount = uset_getItemCount(set); 365 int32_t itemIndex = 0; 366 UChar32 start = 1, end = 0; 367 int32_t itemLen = 0, length; 368 369 ec = U_ZERO_ERROR; 370 length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec); 371 if (U_FAILURE(ec)) { 372 log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec)); 373 return; 374 } 375 pat=aescstrdup(ustr, length); 376 377 if (uset_isEmpty(set) != (strlen(items)==0)) { 378 log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n", 379 pat, 380 strlen(items)==0 ? "TRUE" : "FALSE"); 381 } 382 383 /* Don't test patterns starting with "[^" */ 384 if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) { 385 return; 386 } 387 388 while (*p) { 389 390 ++expectedSize; 391 392 if (start > end || start == -1) { 393 /* Fetch our next item */ 394 if (itemIndex >= itemCount) { 395 log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat); 396 return; 397 } 398 399 itemLen = uset_getItem(set, itemIndex, &start, &end, 400 itemStr, sizeof(itemStr), &ec); 401 if (U_FAILURE(ec) || itemLen < 0) { 402 log_err("FAIL: uset_getItem => %s\n", u_errorName(ec)); 403 return; 404 } 405 406 if (itemLen == 0) { 407 log_verbose("Ok: %s item %d is %c-%c\n", pat, 408 itemIndex, oneUCharToChar(start), 409 oneUCharToChar(end)); 410 } else { 411 itemStr[itemLen] = 0; 412 u_UCharsToChars(itemStr, buf, itemLen+1); 413 log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf); 414 } 415 416 ++itemIndex; 417 } 418 419 if (*p=='{') { 420 const char* stringStart = ++p; 421 int32_t stringLength = 0; 422 char strCopy[64]; 423 424 while (*p++ != '}') { 425 } 426 stringLength = (int32_t)(p - stringStart - 1); 427 strncpy(strCopy, stringStart, stringLength); 428 strCopy[stringLength] = 0; 429 430 u_charsToUChars(stringStart, ustr, stringLength); 431 ustr[stringLength] = 0; 432 433 if (itemLen == 0) { 434 log_err("FAIL: for %s expect \"%s\" next, but got a char\n", 435 pat, strCopy); 436 return; 437 } 438 439 if (u_strcmp(ustr, itemStr) != 0) { 440 log_err("FAIL: for %s expect \"%s\" next\n", 441 pat, strCopy); 442 return; 443 } 444 } 445 446 else { 447 UChar32 c; 448 449 u_charsToUChars(p, ustr, 1); 450 c = ustr[0]; 451 452 if (itemLen != 0) { 453 log_err("FAIL: for %s expect '%c' next, but got a string\n", 454 pat, *p); 455 return; 456 } 457 458 if (c != start++) { 459 log_err("FAIL: for %s expect '%c' next\n", 460 pat, *p); 461 return; 462 } 463 464 ++p; 465 } 466 } 467 468 if (uset_size(set) == expectedSize) { 469 log_verbose("Ok: %s size is %d\n", pat, expectedSize); 470 } else { 471 log_err("FAIL: %s size is %d, expected %d\n", 472 pat, uset_size(set), expectedSize); 473 } 474 } 475 476 static void 477 TestSerialized() { 478 uint16_t buffer[1000]; 479 USerializedSet sset; 480 USet *set; 481 UErrorCode errorCode; 482 UChar32 c; 483 int32_t length; 484 485 /* use a pattern that generates both BMP and supplementary code points */ 486 U_STRING_DECL(pattern, "[:Cf:]", 6); 487 U_STRING_INIT(pattern, "[:Cf:]", 6); 488 489 errorCode=U_ZERO_ERROR; 490 set=uset_openPattern(pattern, -1, &errorCode); 491 if(U_FAILURE(errorCode)) { 492 log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode)); 493 return; 494 } 495 496 length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode); 497 if(U_FAILURE(errorCode)) { 498 log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode)); 499 uset_close(set); 500 return; 501 } 502 503 uset_getSerializedSet(&sset, buffer, length); 504 for(c=0; c<=0x10ffff; ++c) { 505 if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) { 506 log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c); 507 break; 508 } 509 } 510 511 uset_close(set); 512 } 513 514 /** 515 * Make sure that when non-invariant chars are passed to uset_openPattern 516 * they do not cause an ugly failure mode (e.g. assertion failure). 517 * JB#3795. 518 */ 519 static void 520 TestNonInvariantPattern() { 521 UErrorCode ec = U_ZERO_ERROR; 522 /* The critical part of this test is that the following pattern 523 must contain a non-invariant character. */ 524 static const char *pattern = "[:ccc!=0:]"; 525 UChar buf[256]; 526 int32_t len = u_unescape(pattern, buf, 256); 527 /* This test 'fails' by having an assertion failure within the 528 following call. It passes by running to completion with no 529 assertion failure. */ 530 USet *set = uset_openPattern(buf, len, &ec); 531 uset_close(set); 532 } 533 534 static void TestBadPattern(void) { 535 UErrorCode status = U_ZERO_ERROR; 536 USet *pat; 537 U_STRING_DECL(pattern, "[", 1); 538 U_STRING_INIT(pattern, "[", 1); 539 pat = uset_openPatternOptions(pattern, u_strlen(pattern), 0, &status); 540 if (pat != NULL || U_SUCCESS(status)) { 541 log_err("uset_openPatternOptions did not fail as expected %s\n", u_errorName(status)); 542 } 543 } 544 545 static USet *openIDSet() { 546 UErrorCode errorCode = U_ZERO_ERROR; 547 U_STRING_DECL(pattern, "[:ID_Continue:]", 15); 548 U_STRING_INIT(pattern, "[:ID_Continue:]", 15); 549 return uset_openPattern(pattern, 15, &errorCode); 550 } 551 552 static void TestFreezable() { 553 USet *idSet; 554 USet *frozen; 555 USet *thawed; 556 557 idSet=openIDSet(); 558 559 if (idSet == NULL) { 560 log_data_err("openIDSet() returned NULL. (Are you missing data?)\n"); 561 uset_close(idSet); 562 return; 563 } 564 565 frozen=uset_clone(idSet); 566 567 if (frozen == NULL) { 568 log_err("uset_Clone() returned NULL\n"); 569 return; 570 } 571 572 if(!uset_equals(frozen, idSet)) { 573 log_err("uset_clone() did not make an equal copy\n"); 574 } 575 576 uset_freeze(frozen); 577 uset_addRange(frozen, 0xd802, 0xd805); 578 579 if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) { 580 log_err("uset_freeze() or uset_isFrozen() does not work\n"); 581 } 582 583 thawed=uset_cloneAsThawed(frozen); 584 585 if (thawed == NULL) { 586 log_err("uset_cloneAsThawed(frozen) returned NULL"); 587 uset_close(frozen); 588 uset_close(idSet); 589 return; 590 } 591 592 uset_addRange(thawed, 0xd802, 0xd805); 593 594 if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) { 595 log_err("uset_cloneAsThawed() does not work\n"); 596 } 597 598 uset_close(idSet); 599 uset_close(frozen); 600 uset_close(thawed); 601 } 602 603 static void TestSpan() { 604 static const UChar s16[2]={ 0xe01, 0x3000 }; 605 static const char* s8="\xE0\xB8\x81\xE3\x80\x80"; 606 607 USet *idSet=openIDSet(); 608 609 if (idSet == NULL) { 610 log_data_err("openIDSet() returned NULL (Are you missing data?)\n"); 611 return; 612 } 613 614 if( 615 1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) || 616 0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) || 617 2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) || 618 1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) 619 ) { 620 log_err("uset_span() or uset_spanBack() does not work\n"); 621 } 622 623 if( 624 3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) || 625 0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) || 626 6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) || 627 3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) 628 ) { 629 log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n"); 630 } 631 632 uset_freeze(idSet); 633 634 if( 635 1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) || 636 0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) || 637 2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) || 638 1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) 639 ) { 640 log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n"); 641 } 642 643 if( 644 3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) || 645 0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) || 646 6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) || 647 3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) 648 ) { 649 log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n"); 650 } 651 652 uset_close(idSet); 653 } 654 655 /*eof*/ 656