1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File CITERTST.C 9 * 10 * Modification History: 11 * Date Name Description 12 * Madhu Katragadda Ported for C API 13 * 02/19/01 synwee Modified test case for new collation iterator 14 *********************************************************************************/ 15 /* 16 * Collation Iterator tests. 17 * (Let me reiterate my position...) 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucol.h" 25 #include "unicode/uloc.h" 26 #include "unicode/uchar.h" 27 #include "unicode/ustring.h" 28 #include "unicode/putil.h" 29 #include "callcoll.h" 30 #include "cmemory.h" 31 #include "cintltst.h" 32 #include "citertst.h" 33 #include "ccolltst.h" 34 #include "filestrm.h" 35 #include "cstring.h" 36 #include "ucol_imp.h" 37 #include "ucol_tok.h" 38 #include <stdio.h> 39 40 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); 41 42 void addCollIterTest(TestNode** root) 43 { 44 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); 45 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); 46 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); 47 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); 48 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); 49 addTest(root, &TestNormalizedUnicodeChar, 50 "tscoll/citertst/TestNormalizedUnicodeChar"); 51 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); 52 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); 53 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); 54 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); 55 addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); 56 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); 57 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"); 58 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); 59 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); 60 } 61 62 /* The locales we support */ 63 64 static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; 65 66 static void TestBug672() { 67 UErrorCode status = U_ZERO_ERROR; 68 UChar pattern[20]; 69 UChar text[50]; 70 int i; 71 int result[3][3]; 72 73 u_uastrcpy(pattern, "resume"); 74 u_uastrcpy(text, "Time to resume updating my resume."); 75 76 for (i = 0; i < 3; ++ i) { 77 UCollator *coll = ucol_open(LOCALES[i], &status); 78 UCollationElements *pitr = ucol_openElements(coll, pattern, -1, 79 &status); 80 UCollationElements *titer = ucol_openElements(coll, text, -1, 81 &status); 82 if (U_FAILURE(status)) { 83 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 84 myErrorName(status)); 85 return; 86 } 87 88 log_verbose("locale tested %s\n", LOCALES[i]); 89 90 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 91 U_SUCCESS(status)) { 92 } 93 if (U_FAILURE(status)) { 94 log_err("ERROR: reversing collation iterator :%s\n", 95 myErrorName(status)); 96 return; 97 } 98 ucol_reset(pitr); 99 100 ucol_setOffset(titer, u_strlen(pattern), &status); 101 if (U_FAILURE(status)) { 102 log_err("ERROR: setting offset in collator :%s\n", 103 myErrorName(status)); 104 return; 105 } 106 result[i][0] = ucol_getOffset(titer); 107 log_verbose("Text iterator set to offset %d\n", result[i][0]); 108 109 /* Use previous() */ 110 ucol_previous(titer, &status); 111 result[i][1] = ucol_getOffset(titer); 112 log_verbose("Current offset %d after previous\n", result[i][1]); 113 114 /* Add one to index */ 115 log_verbose("Adding one to current offset...\n"); 116 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 117 if (U_FAILURE(status)) { 118 log_err("ERROR: setting offset in collator :%s\n", 119 myErrorName(status)); 120 return; 121 } 122 result[i][2] = ucol_getOffset(titer); 123 log_verbose("Current offset in text = %d\n", result[i][2]); 124 ucol_closeElements(pitr); 125 ucol_closeElements(titer); 126 ucol_close(coll); 127 } 128 129 if (uprv_memcmp(result[0], result[1], 3) != 0 || 130 uprv_memcmp(result[1], result[2], 3) != 0) { 131 log_err("ERROR: Different locales have different offsets at the same character\n"); 132 } 133 } 134 135 136 137 /* Running this test with normalization enabled showed up a bug in the incremental 138 normalization code. */ 139 static void TestBug672Normalize() { 140 UErrorCode status = U_ZERO_ERROR; 141 UChar pattern[20]; 142 UChar text[50]; 143 int i; 144 int result[3][3]; 145 146 u_uastrcpy(pattern, "resume"); 147 u_uastrcpy(text, "Time to resume updating my resume."); 148 149 for (i = 0; i < 3; ++ i) { 150 UCollator *coll = ucol_open(LOCALES[i], &status); 151 UCollationElements *pitr = NULL; 152 UCollationElements *titer = NULL; 153 154 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 155 156 pitr = ucol_openElements(coll, pattern, -1, &status); 157 titer = ucol_openElements(coll, text, -1, &status); 158 if (U_FAILURE(status)) { 159 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 160 myErrorName(status)); 161 return; 162 } 163 164 log_verbose("locale tested %s\n", LOCALES[i]); 165 166 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 167 U_SUCCESS(status)) { 168 } 169 if (U_FAILURE(status)) { 170 log_err("ERROR: reversing collation iterator :%s\n", 171 myErrorName(status)); 172 return; 173 } 174 ucol_reset(pitr); 175 176 ucol_setOffset(titer, u_strlen(pattern), &status); 177 if (U_FAILURE(status)) { 178 log_err("ERROR: setting offset in collator :%s\n", 179 myErrorName(status)); 180 return; 181 } 182 result[i][0] = ucol_getOffset(titer); 183 log_verbose("Text iterator set to offset %d\n", result[i][0]); 184 185 /* Use previous() */ 186 ucol_previous(titer, &status); 187 result[i][1] = ucol_getOffset(titer); 188 log_verbose("Current offset %d after previous\n", result[i][1]); 189 190 /* Add one to index */ 191 log_verbose("Adding one to current offset...\n"); 192 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 193 if (U_FAILURE(status)) { 194 log_err("ERROR: setting offset in collator :%s\n", 195 myErrorName(status)); 196 return; 197 } 198 result[i][2] = ucol_getOffset(titer); 199 log_verbose("Current offset in text = %d\n", result[i][2]); 200 ucol_closeElements(pitr); 201 ucol_closeElements(titer); 202 ucol_close(coll); 203 } 204 205 if (uprv_memcmp(result[0], result[1], 3) != 0 || 206 uprv_memcmp(result[1], result[2], 3) != 0) { 207 log_err("ERROR: Different locales have different offsets at the same character\n"); 208 } 209 } 210 211 212 213 214 /** 215 * Test for CollationElementIterator previous and next for the whole set of 216 * unicode characters. 217 */ 218 static void TestUnicodeChar() 219 { 220 UChar source[0x100]; 221 UCollator *en_us; 222 UCollationElements *iter; 223 UErrorCode status = U_ZERO_ERROR; 224 UChar codepoint; 225 226 UChar *test; 227 en_us = ucol_open("en_US", &status); 228 if (U_FAILURE(status)){ 229 log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n", 230 myErrorName(status)); 231 return; 232 } 233 234 for (codepoint = 1; codepoint < 0xFFFE;) 235 { 236 test = source; 237 238 while (codepoint % 0xFF != 0) 239 { 240 if (u_isdefined(codepoint)) 241 *(test ++) = codepoint; 242 codepoint ++; 243 } 244 245 if (u_isdefined(codepoint)) 246 *(test ++) = codepoint; 247 248 if (codepoint != 0xFFFF) 249 codepoint ++; 250 251 *test = 0; 252 iter=ucol_openElements(en_us, source, u_strlen(source), &status); 253 if(U_FAILURE(status)){ 254 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 255 myErrorName(status)); 256 ucol_close(en_us); 257 return; 258 } 259 /* A basic test to see if it's working at all */ 260 log_verbose("codepoint testing %x\n", codepoint); 261 backAndForth(iter); 262 ucol_closeElements(iter); 263 264 /* null termination test */ 265 iter=ucol_openElements(en_us, source, -1, &status); 266 if(U_FAILURE(status)){ 267 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 268 myErrorName(status)); 269 ucol_close(en_us); 270 return; 271 } 272 /* A basic test to see if it's working at all */ 273 backAndForth(iter); 274 ucol_closeElements(iter); 275 } 276 277 ucol_close(en_us); 278 } 279 280 /** 281 * Test for CollationElementIterator previous and next for the whole set of 282 * unicode characters with normalization on. 283 */ 284 static void TestNormalizedUnicodeChar() 285 { 286 UChar source[0x100]; 287 UCollator *th_th; 288 UCollationElements *iter; 289 UErrorCode status = U_ZERO_ERROR; 290 UChar codepoint; 291 292 UChar *test; 293 /* thai should have normalization on */ 294 th_th = ucol_open("th_TH", &status); 295 if (U_FAILURE(status)){ 296 log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n", 297 myErrorName(status)); 298 return; 299 } 300 301 for (codepoint = 1; codepoint < 0xFFFE;) 302 { 303 test = source; 304 305 while (codepoint % 0xFF != 0) 306 { 307 if (u_isdefined(codepoint)) 308 *(test ++) = codepoint; 309 codepoint ++; 310 } 311 312 if (u_isdefined(codepoint)) 313 *(test ++) = codepoint; 314 315 if (codepoint != 0xFFFF) 316 codepoint ++; 317 318 *test = 0; 319 iter=ucol_openElements(th_th, source, u_strlen(source), &status); 320 if(U_FAILURE(status)){ 321 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 322 myErrorName(status)); 323 ucol_close(th_th); 324 return; 325 } 326 327 backAndForth(iter); 328 ucol_closeElements(iter); 329 330 iter=ucol_openElements(th_th, source, -1, &status); 331 if(U_FAILURE(status)){ 332 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 333 myErrorName(status)); 334 ucol_close(th_th); 335 return; 336 } 337 338 backAndForth(iter); 339 ucol_closeElements(iter); 340 } 341 342 ucol_close(th_th); 343 } 344 345 /** 346 * Test the incremental normalization 347 */ 348 static void TestNormalization() 349 { 350 UErrorCode status = U_ZERO_ERROR; 351 const char *str = 352 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315"; 353 UCollator *coll; 354 UChar rule[50]; 355 int rulelen = u_unescape(str, rule, 50); 356 int count = 0; 357 const char *testdata[] = 358 {"\\u1ED9", "o\\u0323\\u0302", 359 "\\u0300\\u0315", "\\u0315\\u0300", 360 "A\\u0300\\u0315B", "A\\u0315\\u0300B", 361 "A\\u0316\\u0315B", "A\\u0315\\u0316B", 362 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316", 363 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B", 364 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"}; 365 int32_t srclen; 366 UChar source[10]; 367 UCollationElements *iter; 368 369 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status); 370 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 371 if (U_FAILURE(status)){ 372 log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n", 373 myErrorName(status)); 374 return; 375 } 376 377 srclen = u_unescape(testdata[0], source, 10); 378 iter = ucol_openElements(coll, source, srclen, &status); 379 backAndForth(iter); 380 ucol_closeElements(iter); 381 382 srclen = u_unescape(testdata[1], source, 10); 383 iter = ucol_openElements(coll, source, srclen, &status); 384 backAndForth(iter); 385 ucol_closeElements(iter); 386 387 while (count < 12) { 388 srclen = u_unescape(testdata[count], source, 10); 389 iter = ucol_openElements(coll, source, srclen, &status); 390 391 if (U_FAILURE(status)){ 392 log_err("ERROR: in creation of collator element iterator\n %s\n", 393 myErrorName(status)); 394 return; 395 } 396 backAndForth(iter); 397 ucol_closeElements(iter); 398 399 iter = ucol_openElements(coll, source, -1, &status); 400 401 if (U_FAILURE(status)){ 402 log_err("ERROR: in creation of collator element iterator\n %s\n", 403 myErrorName(status)); 404 return; 405 } 406 backAndForth(iter); 407 ucol_closeElements(iter); 408 count ++; 409 } 410 ucol_close(coll); 411 } 412 413 /** 414 * Test for CollationElementIterator.previous() 415 * 416 * @bug 4108758 - Make sure it works with contracting characters 417 * 418 */ 419 static void TestPrevious() 420 { 421 UCollator *coll=NULL; 422 UChar rule[50]; 423 UChar *source; 424 UCollator *c1, *c2, *c3; 425 UCollationElements *iter; 426 UErrorCode status = U_ZERO_ERROR; 427 UChar test1[50]; 428 UChar test2[50]; 429 430 u_uastrcpy(test1, "What subset of all possible test cases?"); 431 u_uastrcpy(test2, "has the highest probability of detecting"); 432 coll = ucol_open("en_US", &status); 433 434 iter=ucol_openElements(coll, test1, u_strlen(test1), &status); 435 log_verbose("English locale testing back and forth\n"); 436 if(U_FAILURE(status)){ 437 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 438 myErrorName(status)); 439 ucol_close(coll); 440 return; 441 } 442 /* A basic test to see if it's working at all */ 443 backAndForth(iter); 444 ucol_closeElements(iter); 445 ucol_close(coll); 446 447 /* Test with a contracting character sequence */ 448 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH"); 449 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 450 451 log_verbose("Contraction rule testing back and forth with no normalization\n"); 452 453 if (c1 == NULL || U_FAILURE(status)) 454 { 455 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n", 456 myErrorName(status)); 457 return; 458 } 459 source=(UChar*)malloc(sizeof(UChar) * 20); 460 u_uastrcpy(source, "abchdcba"); 461 iter=ucol_openElements(c1, source, u_strlen(source), &status); 462 if(U_FAILURE(status)){ 463 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 464 myErrorName(status)); 465 return; 466 } 467 backAndForth(iter); 468 ucol_closeElements(iter); 469 ucol_close(c1); 470 471 /* Test with an expanding character sequence */ 472 u_uastrcpy(rule, "&a < b < c/abd < d"); 473 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 474 log_verbose("Expansion rule testing back and forth with no normalization\n"); 475 if (c2 == NULL || U_FAILURE(status)) 476 { 477 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 478 myErrorName(status)); 479 return; 480 } 481 u_uastrcpy(source, "abcd"); 482 iter=ucol_openElements(c2, source, u_strlen(source), &status); 483 if(U_FAILURE(status)){ 484 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 485 myErrorName(status)); 486 return; 487 } 488 backAndForth(iter); 489 ucol_closeElements(iter); 490 ucol_close(c2); 491 /* Now try both */ 492 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch"); 493 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status); 494 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n"); 495 496 if (c3 == NULL || U_FAILURE(status)) 497 { 498 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 499 myErrorName(status)); 500 return; 501 } 502 u_uastrcpy(source, "abcdbchdc"); 503 iter=ucol_openElements(c3, source, u_strlen(source), &status); 504 if(U_FAILURE(status)){ 505 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 506 myErrorName(status)); 507 return; 508 } 509 backAndForth(iter); 510 ucol_closeElements(iter); 511 ucol_close(c3); 512 source[0] = 0x0e41; 513 source[1] = 0x0e02; 514 source[2] = 0x0e41; 515 source[3] = 0x0e02; 516 source[4] = 0x0e27; 517 source[5] = 0x61; 518 source[6] = 0x62; 519 source[7] = 0x63; 520 source[8] = 0; 521 522 coll = ucol_open("th_TH", &status); 523 log_verbose("Thai locale testing back and forth with normalization\n"); 524 iter=ucol_openElements(coll, source, u_strlen(source), &status); 525 if(U_FAILURE(status)){ 526 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 527 myErrorName(status)); 528 return; 529 } 530 backAndForth(iter); 531 ucol_closeElements(iter); 532 ucol_close(coll); 533 534 /* prev test */ 535 source[0] = 0x0061; 536 source[1] = 0x30CF; 537 source[2] = 0x3099; 538 source[3] = 0x30FC; 539 source[4] = 0; 540 541 coll = ucol_open("ja_JP", &status); 542 log_verbose("Japanese locale testing back and forth with normalization\n"); 543 iter=ucol_openElements(coll, source, u_strlen(source), &status); 544 if(U_FAILURE(status)){ 545 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 546 myErrorName(status)); 547 return; 548 } 549 backAndForth(iter); 550 ucol_closeElements(iter); 551 ucol_close(coll); 552 553 free(source); 554 } 555 556 /** 557 * Test for getOffset() and setOffset() 558 */ 559 static void TestOffset() 560 { 561 UErrorCode status= U_ZERO_ERROR; 562 UCollator *en_us=NULL; 563 UCollationElements *iter, *pristine; 564 int32_t offset; 565 OrderAndOffset *orders; 566 int32_t orderLength=0; 567 int count = 0; 568 UChar test1[50]; 569 UChar test2[50]; 570 571 u_uastrcpy(test1, "What subset of all possible test cases?"); 572 u_uastrcpy(test2, "has the highest probability of detecting"); 573 en_us = ucol_open("en_US", &status); 574 log_verbose("Testing getOffset and setOffset for collations\n"); 575 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status); 576 if(U_FAILURE(status)){ 577 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 578 myErrorName(status)); 579 ucol_close(en_us); 580 return; 581 } 582 583 /* testing boundaries */ 584 ucol_setOffset(iter, 0, &status); 585 if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) { 586 log_err("Error: After setting offset to 0, we should be at the end " 587 "of the backwards iteration"); 588 } 589 ucol_setOffset(iter, u_strlen(test1), &status); 590 if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) { 591 log_err("Error: After setting offset to end of the string, we should " 592 "be at the end of the backwards iteration"); 593 } 594 595 /* Run all the way through the iterator, then get the offset */ 596 597 orders = getOrders(iter, &orderLength); 598 599 offset = ucol_getOffset(iter); 600 601 if (offset != u_strlen(test1)) 602 { 603 log_err("offset at end != length %d vs %d\n", offset, 604 u_strlen(test1) ); 605 } 606 607 /* Now set the offset back to the beginning and see if it works */ 608 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status); 609 if(U_FAILURE(status)){ 610 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 611 myErrorName(status)); 612 ucol_close(en_us); 613 return; 614 } 615 status = U_ZERO_ERROR; 616 617 ucol_setOffset(iter, 0, &status); 618 if (U_FAILURE(status)) 619 { 620 log_err("setOffset failed. %s\n", myErrorName(status)); 621 } 622 else 623 { 624 assertEqual(iter, pristine); 625 } 626 627 ucol_closeElements(pristine); 628 ucol_closeElements(iter); 629 free(orders); 630 631 /* testing offsets in normalization buffer */ 632 test1[0] = 0x61; 633 test1[1] = 0x300; 634 test1[2] = 0x316; 635 test1[3] = 0x62; 636 test1[4] = 0; 637 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 638 iter = ucol_openElements(en_us, test1, 4, &status); 639 if(U_FAILURE(status)){ 640 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 641 myErrorName(status)); 642 ucol_close(en_us); 643 return; 644 } 645 646 count = 0; 647 while (ucol_next(iter, &status) != UCOL_NULLORDER && 648 U_SUCCESS(status)) { 649 switch (count) { 650 case 0: 651 if (ucol_getOffset(iter) != 1) { 652 log_err("ERROR: Offset of iteration should be 1\n"); 653 } 654 break; 655 case 3: 656 if (ucol_getOffset(iter) != 4) { 657 log_err("ERROR: Offset of iteration should be 4\n"); 658 } 659 break; 660 default: 661 if (ucol_getOffset(iter) != 3) { 662 log_err("ERROR: Offset of iteration should be 3\n"); 663 } 664 } 665 count ++; 666 } 667 668 ucol_reset(iter); 669 count = 0; 670 while (ucol_previous(iter, &status) != UCOL_NULLORDER && 671 U_SUCCESS(status)) { 672 switch (count) { 673 case 0: 674 case 1: 675 if (ucol_getOffset(iter) != 3) { 676 log_err("ERROR: Offset of iteration should be 3\n"); 677 } 678 break; 679 case 2: 680 if (ucol_getOffset(iter) != 1) { 681 log_err("ERROR: Offset of iteration should be 1\n"); 682 } 683 break; 684 default: 685 if (ucol_getOffset(iter) != 0) { 686 log_err("ERROR: Offset of iteration should be 0\n"); 687 } 688 } 689 count ++; 690 } 691 692 if(U_FAILURE(status)){ 693 log_err("ERROR: in iterating collation elements %s\n", 694 myErrorName(status)); 695 } 696 697 ucol_closeElements(iter); 698 ucol_close(en_us); 699 } 700 701 /** 702 * Test for setText() 703 */ 704 static void TestSetText() 705 { 706 int32_t c,i; 707 UErrorCode status = U_ZERO_ERROR; 708 UCollator *en_us=NULL; 709 UCollationElements *iter1, *iter2; 710 UChar test1[50]; 711 UChar test2[50]; 712 713 u_uastrcpy(test1, "What subset of all possible test cases?"); 714 u_uastrcpy(test2, "has the highest probability of detecting"); 715 en_us = ucol_open("en_US", &status); 716 log_verbose("testing setText for Collation elements\n"); 717 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status); 718 if(U_FAILURE(status)){ 719 log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", 720 myErrorName(status)); 721 ucol_close(en_us); 722 return; 723 } 724 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status); 725 if(U_FAILURE(status)){ 726 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n", 727 myErrorName(status)); 728 ucol_close(en_us); 729 return; 730 } 731 732 /* Run through the second iterator just to exercise it */ 733 c = ucol_next(iter2, &status); 734 i = 0; 735 736 while ( ++i < 10 && (c != UCOL_NULLORDER)) 737 { 738 if (U_FAILURE(status)) 739 { 740 log_err("iter2->next() returned an error. %s\n", myErrorName(status)); 741 ucol_closeElements(iter2); 742 ucol_closeElements(iter1); 743 ucol_close(en_us); 744 return; 745 } 746 747 c = ucol_next(iter2, &status); 748 } 749 750 /* Now set it to point to the same string as the first iterator */ 751 ucol_setText(iter2, test1, u_strlen(test1), &status); 752 if (U_FAILURE(status)) 753 { 754 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status)); 755 } 756 else 757 { 758 assertEqual(iter1, iter2); 759 } 760 761 /* Now set it to point to a null string with fake length*/ 762 ucol_setText(iter2, NULL, 2, &status); 763 if (U_FAILURE(status)) 764 { 765 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)); 766 } 767 else 768 { 769 if (ucol_next(iter2, &status) != UCOL_NULLORDER) { 770 log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); 771 } 772 } 773 774 ucol_closeElements(iter2); 775 ucol_closeElements(iter1); 776 ucol_close(en_us); 777 } 778 779 /** @bug 4108762 780 * Test for getMaxExpansion() 781 */ 782 static void TestMaxExpansion() 783 { 784 UErrorCode status = U_ZERO_ERROR; 785 UCollator *coll ;/*= ucol_open("en_US", &status);*/ 786 UChar ch = 0; 787 UChar32 unassigned = 0xEFFFD; 788 UChar supplementary[2]; 789 uint32_t index = 0; 790 UBool isError = FALSE; 791 uint32_t sorder = 0; 792 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/ 793 uint32_t temporder = 0; 794 795 UChar rule[256]; 796 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch"); 797 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 798 UCOL_DEFAULT_STRENGTH,NULL, &status); 799 if(U_SUCCESS(status) && coll) { 800 iter = ucol_openElements(coll, &ch, 1, &status); 801 802 while (ch < 0xFFFF && U_SUCCESS(status)) { 803 int count = 1; 804 uint32_t order; 805 int32_t size = 0; 806 807 ch ++; 808 809 ucol_setText(iter, &ch, 1, &status); 810 order = ucol_previous(iter, &status); 811 812 /* thai management */ 813 if (order == 0) 814 order = ucol_previous(iter, &status); 815 816 while (U_SUCCESS(status) && 817 ucol_previous(iter, &status) != UCOL_NULLORDER) { 818 count ++; 819 } 820 821 size = ucol_getMaxExpansion(iter, order); 822 if (U_FAILURE(status) || size < count) { 823 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 824 ch, count); 825 } 826 } 827 828 /* testing for exact max expansion */ 829 ch = 0; 830 while (ch < 0x61) { 831 uint32_t order; 832 int32_t size; 833 ucol_setText(iter, &ch, 1, &status); 834 order = ucol_previous(iter, &status); 835 size = ucol_getMaxExpansion(iter, order); 836 if (U_FAILURE(status) || size != 1) { 837 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 838 ch, 1); 839 } 840 ch ++; 841 } 842 843 ch = 0x63; 844 ucol_setText(iter, &ch, 1, &status); 845 temporder = ucol_previous(iter, &status); 846 847 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) { 848 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 849 ch, 3); 850 } 851 852 ch = 0x64; 853 ucol_setText(iter, &ch, 1, &status); 854 temporder = ucol_previous(iter, &status); 855 856 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) { 857 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 858 ch, 3); 859 } 860 861 U16_APPEND(supplementary, index, 2, unassigned, isError); 862 ucol_setText(iter, supplementary, 2, &status); 863 sorder = ucol_previous(iter, &status); 864 865 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) { 866 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 867 ch, 2); 868 } 869 870 /* testing jamo */ 871 ch = 0x1165; 872 873 ucol_setText(iter, &ch, 1, &status); 874 temporder = ucol_previous(iter, &status); 875 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) { 876 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 877 ch, 3); 878 } 879 880 ucol_closeElements(iter); 881 ucol_close(coll); 882 883 /* testing special jamo &a<\u1160 */ 884 rule[0] = 0x26; 885 rule[1] = 0x71; 886 rule[2] = 0x3c; 887 rule[3] = 0x1165; 888 rule[4] = 0x2f; 889 rule[5] = 0x71; 890 rule[6] = 0x71; 891 rule[7] = 0x71; 892 rule[8] = 0x71; 893 rule[9] = 0; 894 895 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 896 UCOL_DEFAULT_STRENGTH,NULL, &status); 897 iter = ucol_openElements(coll, &ch, 1, &status); 898 899 temporder = ucol_previous(iter, &status); 900 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) { 901 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 902 ch, 5); 903 } 904 905 ucol_closeElements(iter); 906 ucol_close(coll); 907 } else { 908 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 909 } 910 911 } 912 913 914 static void assertEqual(UCollationElements *i1, UCollationElements *i2) 915 { 916 int32_t c1, c2; 917 int32_t count = 0; 918 UErrorCode status = U_ZERO_ERROR; 919 920 do 921 { 922 c1 = ucol_next(i1, &status); 923 c2 = ucol_next(i2, &status); 924 925 if (c1 != c2) 926 { 927 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2); 928 break; 929 } 930 931 count += 1; 932 } 933 while (c1 != UCOL_NULLORDER); 934 } 935 936 /** 937 * Testing iterators with extremely small buffers 938 */ 939 static void TestSmallBuffer() 940 { 941 UErrorCode status = U_ZERO_ERROR; 942 UCollator *coll; 943 UCollationElements *testiter, 944 *iter; 945 int32_t count = 0; 946 OrderAndOffset *testorders, 947 *orders; 948 949 UChar teststr[500]; 950 UChar str[] = {0x300, 0x31A, 0}; 951 /* 952 creating a long string of decomposable characters, 953 since by default the writable buffer is of size 256 954 */ 955 while (count < 500) { 956 if ((count & 1) == 0) { 957 teststr[count ++] = 0x300; 958 } 959 else { 960 teststr[count ++] = 0x31A; 961 } 962 } 963 964 coll = ucol_open("th_TH", &status); 965 if(U_SUCCESS(status) && coll) { 966 testiter = ucol_openElements(coll, teststr, 500, &status); 967 iter = ucol_openElements(coll, str, 2, &status); 968 969 orders = getOrders(iter, &count); 970 if (count != 2) { 971 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n"); 972 } 973 974 /* 975 this will rearrange the string data to 250 characters of 0x300 first then 976 250 characters of 0x031A 977 */ 978 testorders = getOrders(testiter, &count); 979 980 if (count != 500) { 981 log_err("Error decomposition does not give the right sized collation elements\n"); 982 } 983 984 while (count != 0) { 985 /* UCA collation element for 0x0F76 */ 986 if ((count > 250 && testorders[-- count].order != orders[1].order) || 987 (count <= 250 && testorders[-- count].order != orders[0].order)) { 988 log_err("Error decomposition does not give the right collation element at %d count\n", count); 989 break; 990 } 991 } 992 993 free(testorders); 994 free(orders); 995 996 ucol_reset(testiter); 997 998 /* ensures closing of elements done properly to clear writable buffer */ 999 ucol_next(testiter, &status); 1000 ucol_next(testiter, &status); 1001 ucol_closeElements(testiter); 1002 ucol_closeElements(iter); 1003 ucol_close(coll); 1004 } else { 1005 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 1006 } 1007 } 1008 1009 /** 1010 * Sniplets of code from genuca 1011 */ 1012 static int32_t hex2num(char hex) { 1013 if(hex>='0' && hex <='9') { 1014 return hex-'0'; 1015 } else if(hex>='a' && hex<='f') { 1016 return hex-'a'+10; 1017 } else if(hex>='A' && hex<='F') { 1018 return hex-'A'+10; 1019 } else { 1020 return 0; 1021 } 1022 } 1023 1024 /** 1025 * Getting codepoints from a string 1026 * @param str character string contain codepoints seperated by space and ended 1027 * by a semicolon 1028 * @param codepoints array for storage, assuming size > 5 1029 * @return position at the end of the codepoint section 1030 */ 1031 static char * getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { 1032 char *pStartCP = str; 1033 char *pEndCP = str + 4; 1034 1035 *codepoints = (UChar)((hex2num(*pStartCP) << 12) | 1036 (hex2num(*(pStartCP + 1)) << 8) | 1037 (hex2num(*(pStartCP + 2)) << 4) | 1038 (hex2num(*(pStartCP + 3)))); 1039 if (*pEndCP == '|' || *(pEndCP+1) == '|') { 1040 /* pre-context rule */ 1041 pStartCP = pEndCP; 1042 while (*pStartCP==' ' || *pStartCP== '|' ) { 1043 pStartCP++; 1044 } 1045 pEndCP = pStartCP+4; 1046 *contextCPs = *codepoints; 1047 *(++codepoints) = (UChar)((hex2num(*pStartCP) << 12) | 1048 (hex2num(*(pStartCP + 1)) << 8) | 1049 (hex2num(*(pStartCP + 2)) << 4) | 1050 (hex2num(*(pStartCP + 3)))); 1051 contextCPs++; 1052 } 1053 *contextCPs = 0; 1054 codepoints ++; 1055 while (*pEndCP != ';') { 1056 pStartCP = pEndCP + 1; 1057 *codepoints = (UChar)((hex2num(*pStartCP) << 12) | 1058 (hex2num(*(pStartCP + 1)) << 8) | 1059 (hex2num(*(pStartCP + 2)) << 4) | 1060 (hex2num(*(pStartCP + 3)))); 1061 codepoints ++; 1062 pEndCP = pStartCP + 4; 1063 } 1064 *codepoints = 0; 1065 return pEndCP + 1; 1066 } 1067 1068 /** 1069 * Sniplets of code from genuca 1070 */ 1071 static int32_t 1072 readElement(char **from, char *to, char separator, UErrorCode *status) 1073 { 1074 if (U_SUCCESS(*status)) { 1075 char buffer[1024]; 1076 int32_t i = 0; 1077 while (**from != separator) { 1078 if (**from != ' ') { 1079 *(buffer+i++) = **from; 1080 } 1081 (*from)++; 1082 } 1083 (*from)++; 1084 *(buffer + i) = 0; 1085 strcpy(to, buffer); 1086 return i/2; 1087 } 1088 1089 return 0; 1090 } 1091 1092 /** 1093 * Sniplets of code from genuca 1094 */ 1095 static uint32_t 1096 getSingleCEValue(char *primary, char *secondary, char *tertiary, 1097 UErrorCode *status) 1098 { 1099 if (U_SUCCESS(*status)) { 1100 uint32_t value = 0; 1101 char primsave = '\0'; 1102 char secsave = '\0'; 1103 char tersave = '\0'; 1104 char *primend = primary+4; 1105 char *secend = secondary+2; 1106 char *terend = tertiary+2; 1107 uint32_t primvalue; 1108 uint32_t secvalue; 1109 uint32_t tervalue; 1110 1111 if (uprv_strlen(primary) > 4) { 1112 primsave = *primend; 1113 *primend = '\0'; 1114 } 1115 1116 if (uprv_strlen(secondary) > 2) { 1117 secsave = *secend; 1118 *secend = '\0'; 1119 } 1120 1121 if (uprv_strlen(tertiary) > 2) { 1122 tersave = *terend; 1123 *terend = '\0'; 1124 } 1125 1126 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; 1127 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; 1128 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; 1129 if(primvalue <= 0xFF) { 1130 primvalue <<= 8; 1131 } 1132 1133 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) 1134 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) 1135 | (tervalue & UCOL_TERTIARYORDERMASK); 1136 1137 if(primsave!='\0') { 1138 *primend = primsave; 1139 } 1140 if(secsave!='\0') { 1141 *secend = secsave; 1142 } 1143 if(tersave!='\0') { 1144 *terend = tersave; 1145 } 1146 return value; 1147 } 1148 return 0; 1149 } 1150 1151 /** 1152 * Getting collation elements generated from a string 1153 * @param str character string contain collation elements contained in [] and 1154 * seperated by space 1155 * @param ce array for storage, assuming size > 20 1156 * @param status error status 1157 * @return position at the end of the codepoint section 1158 */ 1159 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { 1160 char *pStartCP = uprv_strchr(str, '['); 1161 int count = 0; 1162 char *pEndCP; 1163 char primary[100]; 1164 char secondary[100]; 1165 char tertiary[100]; 1166 1167 while (*pStartCP == '[') { 1168 uint32_t primarycount = 0; 1169 uint32_t secondarycount = 0; 1170 uint32_t tertiarycount = 0; 1171 uint32_t CEi = 1; 1172 pEndCP = strchr(pStartCP, ']'); 1173 if(pEndCP == NULL) { 1174 break; 1175 } 1176 pStartCP ++; 1177 1178 primarycount = readElement(&pStartCP, primary, ',', status); 1179 secondarycount = readElement(&pStartCP, secondary, ',', status); 1180 tertiarycount = readElement(&pStartCP, tertiary, ']', status); 1181 1182 /* I want to get the CEs entered right here, including continuation */ 1183 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); 1184 if (U_FAILURE(*status)) { 1185 break; 1186 } 1187 1188 while (2 * CEi < primarycount || CEi < secondarycount || 1189 CEi < tertiarycount) { 1190 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 1191 if (2 * CEi < primarycount) { 1192 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); 1193 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); 1194 } 1195 1196 if (2 * CEi + 1 < primarycount) { 1197 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); 1198 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); 1199 } 1200 1201 if (CEi < secondarycount) { 1202 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); 1203 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); 1204 } 1205 1206 if (CEi < tertiarycount) { 1207 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); 1208 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); 1209 } 1210 1211 CEi ++; 1212 ces[count ++] = value; 1213 } 1214 1215 pStartCP = pEndCP + 1; 1216 } 1217 ces[count] = 0; 1218 return pStartCP; 1219 } 1220 1221 /** 1222 * Getting the FractionalUCA.txt file stream 1223 */ 1224 static FileStream * getFractionalUCA(void) 1225 { 1226 char newPath[256]; 1227 char backupPath[256]; 1228 FileStream *result = NULL; 1229 1230 /* Look inside ICU_DATA first */ 1231 uprv_strcpy(newPath, ctest_dataSrcDir()); 1232 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); 1233 uprv_strcat(newPath, "FractionalUCA.txt"); 1234 1235 /* As a fallback, try to guess where the source data was located 1236 * at the time ICU was built, and look there. 1237 */ 1238 #if defined (U_TOPSRCDIR) 1239 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); 1240 #else 1241 { 1242 UErrorCode errorCode = U_ZERO_ERROR; 1243 strcpy(backupPath, loadTestData(&errorCode)); 1244 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); 1245 } 1246 #endif 1247 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt"); 1248 1249 result = T_FileStream_open(newPath, "rb"); 1250 1251 if (result == NULL) { 1252 result = T_FileStream_open(backupPath, "rb"); 1253 if (result == NULL) { 1254 log_err("Failed to open either %s or %s\n", newPath, backupPath); 1255 } 1256 } 1257 return result; 1258 } 1259 1260 /** 1261 * Testing the CEs returned by the iterator 1262 */ 1263 static void TestCEs() { 1264 FileStream *file = NULL; 1265 char line[1024]; 1266 char *str; 1267 UChar codepoints[10]; 1268 uint32_t ces[20]; 1269 UErrorCode status = U_ZERO_ERROR; 1270 UCollator *coll = ucol_open("", &status); 1271 uint32_t lineNo = 0; 1272 UChar contextCPs[5]; 1273 1274 if (U_FAILURE(status)) { 1275 log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status)); 1276 return; 1277 } 1278 1279 file = getFractionalUCA(); 1280 1281 if (file == NULL) { 1282 log_err("*** unable to open input FractionalUCA.txt file ***\n"); 1283 return; 1284 } 1285 1286 1287 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1288 int count = 0; 1289 UCollationElements *iter; 1290 int32_t preContextCeLen=0; 1291 lineNo++; 1292 /* skip this line if it is empty or a comment or is a return value 1293 or start of some variable section */ 1294 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1295 line[0] == 0x000D || line[0] == '[') { 1296 continue; 1297 } 1298 1299 str = getCodePoints(line, codepoints, contextCPs); 1300 1301 /* these are 'fake' codepoints in the fractional UCA, and are used just 1302 * for positioning of indirect values. They should not go through this 1303 * test. 1304 */ 1305 if(*codepoints == 0xFDD0) { 1306 continue; 1307 } 1308 if (*contextCPs != 0) { 1309 iter = ucol_openElements(coll, contextCPs, -1, &status); 1310 if (U_FAILURE(status)) { 1311 log_err("Error in opening collation elements\n"); 1312 break; 1313 } 1314 while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) { 1315 preContextCeLen++; 1316 } 1317 ucol_closeElements(iter); 1318 } 1319 1320 getCEs(str, ces+preContextCeLen, &status); 1321 if (U_FAILURE(status)) { 1322 log_err("Error in parsing collation elements in FractionalUCA.txt\n"); 1323 break; 1324 } 1325 iter = ucol_openElements(coll, codepoints, -1, &status); 1326 if (U_FAILURE(status)) { 1327 log_err("Error in opening collation elements\n"); 1328 break; 1329 } 1330 for (;;) { 1331 uint32_t ce = (uint32_t)ucol_next(iter, &status); 1332 if (ce == 0xFFFFFFFF) { 1333 ce = 0; 1334 } 1335 /* we now unconditionally reorder Thai/Lao prevowels, so this 1336 * test would fail if we don't skip here. 1337 */ 1338 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { 1339 continue; 1340 } 1341 if (ce != ces[count] || U_FAILURE(status)) { 1342 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n"); 1343 break; 1344 } 1345 if (ces[count] == 0) { 1346 break; 1347 } 1348 count ++; 1349 } 1350 ucol_closeElements(iter); 1351 } 1352 1353 T_FileStream_close(file); 1354 ucol_close(coll); 1355 } 1356 1357 /** 1358 * Testing the discontigous contractions 1359 */ 1360 static void TestDiscontiguos() { 1361 const char *rulestr = 1362 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; 1363 UChar rule[50]; 1364 int rulelen = u_unescape(rulestr, rule, 50); 1365 const char *src[] = { 1366 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", 1367 /* base character blocked */ 1368 "XD\\u0300", "XD\\u0300\\u0315", 1369 /* non blocking combining character */ 1370 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315", 1371 /* blocking combining character */ 1372 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315", 1373 /* contraction prefix */ 1374 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315", 1375 "X\\u0300\\u031A\\u0315", 1376 /* ends not with a contraction character */ 1377 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D", 1378 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D" 1379 }; 1380 const char *tgt[] = { 1381 /* non blocking combining character */ 1382 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC", 1383 /* base character blocked */ 1384 "X D \\u0300", "X D \\u0300\\u0315", 1385 /* non blocking combining character */ 1386 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319", 1387 /* blocking combining character */ 1388 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315", 1389 /* contraction prefix */ 1390 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319", 1391 "X\\u0300 \\u031A \\u0315", 1392 /* ends not with a contraction character */ 1393 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D", 1394 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D" 1395 }; 1396 int size = 20; 1397 UCollator *coll; 1398 UErrorCode status = U_ZERO_ERROR; 1399 int count = 0; 1400 UCollationElements *iter; 1401 UCollationElements *resultiter; 1402 1403 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status); 1404 iter = ucol_openElements(coll, rule, 1, &status); 1405 resultiter = ucol_openElements(coll, rule, 1, &status); 1406 1407 if (U_FAILURE(status)) { 1408 log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status)); 1409 return; 1410 } 1411 1412 while (count < size) { 1413 UChar str[20]; 1414 UChar tstr[20]; 1415 int strLen = u_unescape(src[count], str, 20); 1416 UChar *s; 1417 1418 ucol_setText(iter, str, strLen, &status); 1419 if (U_FAILURE(status)) { 1420 log_err("Error opening collation iterator\n"); 1421 return; 1422 } 1423 1424 u_unescape(tgt[count], tstr, 20); 1425 s = tstr; 1426 1427 log_verbose("count %d\n", count); 1428 1429 for (;;) { 1430 uint32_t ce; 1431 UChar *e = u_strchr(s, 0x20); 1432 if (e == 0) { 1433 e = u_strchr(s, 0); 1434 } 1435 ucol_setText(resultiter, s, (int32_t)(e - s), &status); 1436 ce = ucol_next(resultiter, &status); 1437 if (U_FAILURE(status)) { 1438 log_err("Error manipulating collation iterator\n"); 1439 return; 1440 } 1441 while (ce != UCOL_NULLORDER) { 1442 if (ce != (uint32_t)ucol_next(iter, &status) || 1443 U_FAILURE(status)) { 1444 log_err("Discontiguos contraction test mismatch\n"); 1445 return; 1446 } 1447 ce = ucol_next(resultiter, &status); 1448 if (U_FAILURE(status)) { 1449 log_err("Error getting next collation element\n"); 1450 return; 1451 } 1452 } 1453 s = e + 1; 1454 if (*e == 0) { 1455 break; 1456 } 1457 } 1458 ucol_reset(iter); 1459 backAndForth(iter); 1460 count ++; 1461 } 1462 ucol_closeElements(resultiter); 1463 ucol_closeElements(iter); 1464 ucol_close(coll); 1465 } 1466 1467 static void TestCEBufferOverflow() 1468 { 1469 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; 1470 UErrorCode status = U_ZERO_ERROR; 1471 UChar rule[10]; 1472 UCollator *coll; 1473 UCollationElements *iter; 1474 1475 u_uastrcpy(rule, "&z < AB"); 1476 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status); 1477 if (U_FAILURE(status)) { 1478 log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status)); 1479 return; 1480 } 1481 1482 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic 1483 test. this will cause an overflow in getPrev */ 1484 str[0] = 0x0041; /* 'A' */ 1485 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ 1486 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); 1487 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ 1488 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, 1489 &status); 1490 if (ucol_previous(iter, &status) == UCOL_NULLORDER || 1491 status == U_BUFFER_OVERFLOW_ERROR) { 1492 log_err("CE buffer should not overflow with long string of trail surrogates\n"); 1493 } 1494 ucol_closeElements(iter); 1495 ucol_close(coll); 1496 } 1497 1498 /** 1499 * Byte bounds checks. Checks if each byte in data is between upper and lower 1500 * inclusive. 1501 */ 1502 static UBool checkByteBounds(uint32_t data, char upper, char lower) 1503 { 1504 int count = 4; 1505 while (count > 0) { 1506 char b = (char)(data & 0xFF); 1507 if (b > upper || b < lower) { 1508 return FALSE; 1509 } 1510 data = data >> 8; 1511 count --; 1512 } 1513 return TRUE; 1514 } 1515 1516 /** 1517 * Determines case of the string of codepoints. 1518 * If it is a multiple codepoints it has to treated as a contraction. 1519 */ 1520 #if 0 1521 static uint8_t getCase(const UChar *s, uint32_t len) { 1522 UBool lower = FALSE; 1523 UBool upper = FALSE; 1524 UBool title = FALSE; 1525 UErrorCode status = U_ZERO_ERROR; 1526 UChar str[256]; 1527 const UChar *ps = s; 1528 1529 if (len == 0) { 1530 return UCOL_LOWER_CASE; 1531 } 1532 1533 while (len > 0) { 1534 UChar c = *ps ++; 1535 1536 if (u_islower(c)) { 1537 lower = TRUE; 1538 } 1539 if (u_isupper(c)) { 1540 upper = TRUE; 1541 } 1542 if (u_istitle(c)) { 1543 title = TRUE; 1544 } 1545 1546 len --; 1547 } 1548 if ((lower && !upper && !title) || (!lower && !upper && !title)){ 1549 return UCOL_LOWER_CASE; 1550 } 1551 if (upper && !lower && !title) { 1552 return UCOL_UPPER_CASE; 1553 } 1554 /* mix of cases here */ 1555 /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status); 1556 if (U_FAILURE(status)) { 1557 log_err("Error normalizing data string\n"); 1558 return UCOL_LOWER_CASE; 1559 }*/ 1560 1561 if ((title && len >= 2) || (lower && upper)) { 1562 return UCOL_MIXED_CASE; 1563 } 1564 if (u_isupper(s[0])) { 1565 return UCOL_UPPER_CASE; 1566 } 1567 return UCOL_LOWER_CASE; 1568 } 1569 #endif 1570 1571 /** 1572 * Checking collation element validity given the boundary arguments. 1573 */ 1574 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, 1575 int length, uint32_t primarymax, 1576 uint32_t secondarymax) 1577 { 1578 UErrorCode status = U_ZERO_ERROR; 1579 UCollationElements *iter = ucol_openElements(coll, codepoints, length, 1580 &status); 1581 uint32_t ce; 1582 UBool first = TRUE; 1583 /* 1584 UBool upper = FALSE; 1585 UBool lower = FALSE; 1586 */ 1587 1588 if (U_FAILURE(status)) { 1589 log_err("Error creating iterator for testing validity\n"); 1590 } 1591 1592 ce = ucol_next(iter, &status); 1593 1594 while (ce != UCOL_NULLORDER) { 1595 if (ce != 0) { 1596 uint32_t primary = UCOL_PRIMARYORDER(ce); 1597 uint32_t secondary = UCOL_SECONDARYORDER(ce); 1598 uint32_t tertiary = UCOL_TERTIARYORDER(ce); 1599 /* uint32_t scasebits = tertiary & 0xC0;*/ 1600 1601 if ((tertiary == 0 && secondary != 0) || 1602 (tertiary < 0xC0 && secondary == 0 && primary != 0)) { 1603 /* n-1th level is not zero when the nth level is 1604 except for continuations, this is wrong */ 1605 log_err("Lower level weight not 0 when high level weight is 0\n"); 1606 goto fail; 1607 } 1608 else { 1609 /* checks if any byte is illegal ie = 01 02 03. */ 1610 if (checkByteBounds(ce, 0x3, 0x1)) { 1611 log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n"); 1612 goto fail; 1613 } 1614 } 1615 if ((primary != 0 && primary < primarymax) 1616 || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF) 1617 || ((primary & 0xFF) && ((primary & 0xFF) <= 2)) 1618 || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 2) 1619 || (primary >= 0xFE00 && !isContinuation(ce))) { 1620 log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n", 1621 primary, codepoints[0]); 1622 goto fail; 1623 } 1624 /* case matching not done since data generated by ken */ 1625 if (first) { 1626 if (secondary >= 6 && secondary <= secondarymax) { 1627 log_err("Secondary weight out of range\n"); 1628 goto fail; 1629 } 1630 first = FALSE; 1631 } 1632 } 1633 ce = ucol_next(iter, &status); 1634 } 1635 ucol_closeElements(iter); 1636 return TRUE; 1637 fail : 1638 ucol_closeElements(iter); 1639 return FALSE; 1640 } 1641 1642 static void TestCEValidity() 1643 { 1644 /* testing UCA collation elements */ 1645 UErrorCode status = U_ZERO_ERROR; 1646 /* en_US has no tailorings */ 1647 UCollator *coll = ucol_open("root", &status); 1648 /* tailored locales */ 1649 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"}; 1650 const char *loc; 1651 FileStream *file = NULL; 1652 char line[1024]; 1653 UChar codepoints[10]; 1654 int count = 0; 1655 int maxCount = 0; 1656 UChar contextCPs[3]; 1657 UChar32 c; 1658 UParseError parseError; 1659 if (U_FAILURE(status)) { 1660 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1661 return; 1662 } 1663 log_verbose("Testing UCA elements\n"); 1664 file = getFractionalUCA(); 1665 if (file == NULL) { 1666 log_err("Fractional UCA data can not be opened\n"); 1667 return; 1668 } 1669 1670 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1671 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1672 line[0] == 0x000D || line[0] == '[') { 1673 continue; 1674 } 1675 1676 getCodePoints(line, codepoints, contextCPs); 1677 checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86); 1678 } 1679 1680 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1681 for (c = 0; c <= 0xffff; ++c) { 1682 if (u_isdefined(c)) { 1683 codepoints[0] = (UChar)c; 1684 checkCEValidity(coll, codepoints, 1, 5, 86); 1685 } 1686 } 1687 for (; c <= 0x10ffff; ++c) { 1688 if (u_isdefined(c)) { 1689 int32_t i = 0; 1690 U16_APPEND_UNSAFE(codepoints, i, c); 1691 checkCEValidity(coll, codepoints, i, 5, 86); 1692 } 1693 } 1694 1695 ucol_close(coll); 1696 1697 /* testing tailored collation elements */ 1698 log_verbose("Testing tailored elements\n"); 1699 if(QUICK) { 1700 maxCount = sizeof(locale)/sizeof(locale[0]); 1701 } else { 1702 maxCount = uloc_countAvailable(); 1703 } 1704 while (count < maxCount) { 1705 const UChar *rules = NULL, 1706 *current = NULL; 1707 UChar *rulesCopy = NULL; 1708 int32_t ruleLen = 0; 1709 1710 uint32_t chOffset = 0; 1711 uint32_t chLen = 0; 1712 uint32_t exOffset = 0; 1713 uint32_t exLen = 0; 1714 uint32_t prefixOffset = 0; 1715 uint32_t prefixLen = 0; 1716 UBool startOfRules = TRUE; 1717 UColOptionSet opts; 1718 1719 UColTokenParser src; 1720 uint32_t strength = 0; 1721 uint16_t specs = 0; 1722 if(QUICK) { 1723 loc = locale[count]; 1724 } else { 1725 loc = uloc_getAvailable(count); 1726 if(!hasCollationElements(loc)) { 1727 count++; 1728 continue; 1729 } 1730 } 1731 1732 log_verbose("Testing CEs for %s\n", loc); 1733 1734 coll = ucol_open(loc, &status); 1735 if (U_FAILURE(status)) { 1736 log_err("%s collator creation failed\n", loc); 1737 return; 1738 } 1739 1740 src.opts = &opts; 1741 rules = ucol_getRules(coll, &ruleLen); 1742 1743 if (ruleLen > 0) { 1744 rulesCopy = (UChar *)malloc((ruleLen + 1745 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1746 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1747 src.current = src.source = rulesCopy; 1748 src.end = rulesCopy + ruleLen; 1749 src.extraCurrent = src.end; 1750 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1751 1752 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) { 1753 strength = src.parsedToken.strength; 1754 chOffset = src.parsedToken.charsOffset; 1755 chLen = src.parsedToken.charsLen; 1756 exOffset = src.parsedToken.extensionOffset; 1757 exLen = src.parsedToken.extensionLen; 1758 prefixOffset = src.parsedToken.prefixOffset; 1759 prefixLen = src.parsedToken.prefixLen; 1760 specs = src.parsedToken.flags; 1761 1762 startOfRules = FALSE; 1763 uprv_memcpy(codepoints, src.source + chOffset, 1764 chLen * sizeof(UChar)); 1765 codepoints[chLen] = 0; 1766 checkCEValidity(coll, codepoints, chLen, 4, 85); 1767 } 1768 free(rulesCopy); 1769 } 1770 1771 ucol_close(coll); 1772 count ++; 1773 } 1774 T_FileStream_close(file); 1775 } 1776 1777 static void printSortKeyError(const UChar *codepoints, int length, 1778 uint8_t *sortkey, int sklen) 1779 { 1780 int count = 0; 1781 log_err("Sortkey not valid for "); 1782 while (length > 0) { 1783 log_err("0x%04x ", *codepoints); 1784 length --; 1785 codepoints ++; 1786 } 1787 log_err("\nSortkey : "); 1788 while (count < sklen) { 1789 log_err("0x%02x ", sortkey[count]); 1790 count ++; 1791 } 1792 log_err("\n"); 1793 } 1794 1795 /** 1796 * Checking sort key validity for all levels 1797 */ 1798 static UBool checkSortKeyValidity(UCollator *coll, 1799 const UChar *codepoints, 1800 int length) 1801 { 1802 UErrorCode status = U_ZERO_ERROR; 1803 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, 1804 UCOL_TERTIARY, UCOL_QUATERNARY, 1805 UCOL_IDENTICAL}; 1806 int strengthlen = 5; 1807 int index = 0; 1808 int caselevel = 0; 1809 1810 while (caselevel < 1) { 1811 if (caselevel == 0) { 1812 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); 1813 } 1814 else { 1815 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); 1816 } 1817 1818 while (index < strengthlen) { 1819 int count01 = 0; 1820 uint32_t count = 0; 1821 uint8_t sortkey[128]; 1822 uint32_t sklen; 1823 1824 ucol_setStrength(coll, strength[index]); 1825 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); 1826 while (sortkey[count] != 0) { 1827 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) { 1828 printSortKeyError(codepoints, length, sortkey, sklen); 1829 return FALSE; 1830 } 1831 if (sortkey[count] == 1) { 1832 count01 ++; 1833 } 1834 count ++; 1835 } 1836 1837 if (count + 1 != sklen || (count01 != index + caselevel)) { 1838 printSortKeyError(codepoints, length, sortkey, sklen); 1839 return FALSE; 1840 } 1841 index ++; 1842 } 1843 caselevel ++; 1844 } 1845 return TRUE; 1846 } 1847 1848 static void TestSortKeyValidity(void) 1849 { 1850 /* testing UCA collation elements */ 1851 UErrorCode status = U_ZERO_ERROR; 1852 /* en_US has no tailorings */ 1853 UCollator *coll = ucol_open("en_US", &status); 1854 /* tailored locales */ 1855 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; 1856 FileStream *file = NULL; 1857 char line[1024]; 1858 UChar codepoints[10]; 1859 int count = 0; 1860 UChar contextCPs[5]; 1861 UParseError parseError; 1862 if (U_FAILURE(status)) { 1863 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1864 return; 1865 } 1866 log_verbose("Testing UCA elements\n"); 1867 file = getFractionalUCA(); 1868 if (file == NULL) { 1869 log_err("Fractional UCA data can not be opened\n"); 1870 return; 1871 } 1872 1873 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1874 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1875 line[0] == 0x000D || line[0] == '[') { 1876 continue; 1877 } 1878 1879 getCodePoints(line, codepoints, contextCPs); 1880 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); 1881 } 1882 1883 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1884 codepoints[0] = 0; 1885 1886 while (codepoints[0] < 0xFFFF) { 1887 if (u_isdefined((UChar32)codepoints[0])) { 1888 checkSortKeyValidity(coll, codepoints, 1); 1889 } 1890 codepoints[0] ++; 1891 } 1892 1893 ucol_close(coll); 1894 1895 /* testing tailored collation elements */ 1896 log_verbose("Testing tailored elements\n"); 1897 while (count < 5) { 1898 const UChar *rules = NULL, 1899 *current = NULL; 1900 UChar *rulesCopy = NULL; 1901 int32_t ruleLen = 0; 1902 1903 uint32_t chOffset = 0; 1904 uint32_t chLen = 0; 1905 uint32_t exOffset = 0; 1906 uint32_t exLen = 0; 1907 uint32_t prefixOffset = 0; 1908 uint32_t prefixLen = 0; 1909 UBool startOfRules = TRUE; 1910 UColOptionSet opts; 1911 1912 UColTokenParser src; 1913 uint32_t strength = 0; 1914 uint16_t specs = 0; 1915 1916 coll = ucol_open(locale[count], &status); 1917 if (U_FAILURE(status)) { 1918 log_err("%s collator creation failed\n", locale[count]); 1919 return; 1920 } 1921 1922 src.opts = &opts; 1923 rules = ucol_getRules(coll, &ruleLen); 1924 1925 if (ruleLen > 0) { 1926 rulesCopy = (UChar *)malloc((ruleLen + 1927 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1928 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1929 src.current = src.source = rulesCopy; 1930 src.end = rulesCopy + ruleLen; 1931 src.extraCurrent = src.end; 1932 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1933 1934 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) { 1935 strength = src.parsedToken.strength; 1936 chOffset = src.parsedToken.charsOffset; 1937 chLen = src.parsedToken.charsLen; 1938 exOffset = src.parsedToken.extensionOffset; 1939 exLen = src.parsedToken.extensionLen; 1940 prefixOffset = src.parsedToken.prefixOffset; 1941 prefixLen = src.parsedToken.prefixLen; 1942 specs = src.parsedToken.flags; 1943 1944 startOfRules = FALSE; 1945 uprv_memcpy(codepoints, src.source + chOffset, 1946 chLen * sizeof(UChar)); 1947 codepoints[chLen] = 0; 1948 checkSortKeyValidity(coll, codepoints, chLen); 1949 } 1950 free(rulesCopy); 1951 } 1952 1953 ucol_close(coll); 1954 count ++; 1955 } 1956 T_FileStream_close(file); 1957 } 1958 1959 #endif /* #if !UCONFIG_NO_COLLATION */ 1960