1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File CITERTST.C 9 * 10 * Modification History: 11 * Date Name Description 12 * Madhu Katragadda Ported for C API 13 * 02/19/01 synwee Modified test case for new collation iterator 14 *********************************************************************************/ 15 /* 16 * Collation Iterator tests. 17 * (Let me reiterate my position...) 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucol.h" 25 #include "unicode/ucoleitr.h" 26 #include "unicode/uloc.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ustring.h" 29 #include "unicode/putil.h" 30 #include "callcoll.h" 31 #include "cmemory.h" 32 #include "cintltst.h" 33 #include "citertst.h" 34 #include "ccolltst.h" 35 #include "filestrm.h" 36 #include "cstring.h" 37 #include "ucol_imp.h" 38 #include "ucol_tok.h" 39 #include "uparse.h" 40 #include <stdio.h> 41 42 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); 43 44 void addCollIterTest(TestNode** root) 45 { 46 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); 47 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); 48 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); 49 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); 50 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); 51 addTest(root, &TestNormalizedUnicodeChar, 52 "tscoll/citertst/TestNormalizedUnicodeChar"); 53 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); 54 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); 55 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); 56 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); 57 addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); 58 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); 59 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"); 60 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); 61 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); 62 addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements"); 63 } 64 65 /* The locales we support */ 66 67 static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; 68 69 static void TestBug672() { 70 UErrorCode status = U_ZERO_ERROR; 71 UChar pattern[20]; 72 UChar text[50]; 73 int i; 74 int result[3][3]; 75 76 u_uastrcpy(pattern, "resume"); 77 u_uastrcpy(text, "Time to resume updating my resume."); 78 79 for (i = 0; i < 3; ++ i) { 80 UCollator *coll = ucol_open(LOCALES[i], &status); 81 UCollationElements *pitr = ucol_openElements(coll, pattern, -1, 82 &status); 83 UCollationElements *titer = ucol_openElements(coll, text, -1, 84 &status); 85 if (U_FAILURE(status)) { 86 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 87 myErrorName(status)); 88 return; 89 } 90 91 log_verbose("locale tested %s\n", LOCALES[i]); 92 93 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 94 U_SUCCESS(status)) { 95 } 96 if (U_FAILURE(status)) { 97 log_err("ERROR: reversing collation iterator :%s\n", 98 myErrorName(status)); 99 return; 100 } 101 ucol_reset(pitr); 102 103 ucol_setOffset(titer, u_strlen(pattern), &status); 104 if (U_FAILURE(status)) { 105 log_err("ERROR: setting offset in collator :%s\n", 106 myErrorName(status)); 107 return; 108 } 109 result[i][0] = ucol_getOffset(titer); 110 log_verbose("Text iterator set to offset %d\n", result[i][0]); 111 112 /* Use previous() */ 113 ucol_previous(titer, &status); 114 result[i][1] = ucol_getOffset(titer); 115 log_verbose("Current offset %d after previous\n", result[i][1]); 116 117 /* Add one to index */ 118 log_verbose("Adding one to current offset...\n"); 119 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 120 if (U_FAILURE(status)) { 121 log_err("ERROR: setting offset in collator :%s\n", 122 myErrorName(status)); 123 return; 124 } 125 result[i][2] = ucol_getOffset(titer); 126 log_verbose("Current offset in text = %d\n", result[i][2]); 127 ucol_closeElements(pitr); 128 ucol_closeElements(titer); 129 ucol_close(coll); 130 } 131 132 if (uprv_memcmp(result[0], result[1], 3) != 0 || 133 uprv_memcmp(result[1], result[2], 3) != 0) { 134 log_err("ERROR: Different locales have different offsets at the same character\n"); 135 } 136 } 137 138 139 140 /* Running this test with normalization enabled showed up a bug in the incremental 141 normalization code. */ 142 static void TestBug672Normalize() { 143 UErrorCode status = U_ZERO_ERROR; 144 UChar pattern[20]; 145 UChar text[50]; 146 int i; 147 int result[3][3]; 148 149 u_uastrcpy(pattern, "resume"); 150 u_uastrcpy(text, "Time to resume updating my resume."); 151 152 for (i = 0; i < 3; ++ i) { 153 UCollator *coll = ucol_open(LOCALES[i], &status); 154 UCollationElements *pitr = NULL; 155 UCollationElements *titer = NULL; 156 157 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 158 159 pitr = ucol_openElements(coll, pattern, -1, &status); 160 titer = ucol_openElements(coll, text, -1, &status); 161 if (U_FAILURE(status)) { 162 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 163 myErrorName(status)); 164 return; 165 } 166 167 log_verbose("locale tested %s\n", LOCALES[i]); 168 169 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 170 U_SUCCESS(status)) { 171 } 172 if (U_FAILURE(status)) { 173 log_err("ERROR: reversing collation iterator :%s\n", 174 myErrorName(status)); 175 return; 176 } 177 ucol_reset(pitr); 178 179 ucol_setOffset(titer, u_strlen(pattern), &status); 180 if (U_FAILURE(status)) { 181 log_err("ERROR: setting offset in collator :%s\n", 182 myErrorName(status)); 183 return; 184 } 185 result[i][0] = ucol_getOffset(titer); 186 log_verbose("Text iterator set to offset %d\n", result[i][0]); 187 188 /* Use previous() */ 189 ucol_previous(titer, &status); 190 result[i][1] = ucol_getOffset(titer); 191 log_verbose("Current offset %d after previous\n", result[i][1]); 192 193 /* Add one to index */ 194 log_verbose("Adding one to current offset...\n"); 195 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 196 if (U_FAILURE(status)) { 197 log_err("ERROR: setting offset in collator :%s\n", 198 myErrorName(status)); 199 return; 200 } 201 result[i][2] = ucol_getOffset(titer); 202 log_verbose("Current offset in text = %d\n", result[i][2]); 203 ucol_closeElements(pitr); 204 ucol_closeElements(titer); 205 ucol_close(coll); 206 } 207 208 if (uprv_memcmp(result[0], result[1], 3) != 0 || 209 uprv_memcmp(result[1], result[2], 3) != 0) { 210 log_err("ERROR: Different locales have different offsets at the same character\n"); 211 } 212 } 213 214 215 216 217 /** 218 * Test for CollationElementIterator previous and next for the whole set of 219 * unicode characters. 220 */ 221 static void TestUnicodeChar() 222 { 223 UChar source[0x100]; 224 UCollator *en_us; 225 UCollationElements *iter; 226 UErrorCode status = U_ZERO_ERROR; 227 UChar codepoint; 228 229 UChar *test; 230 en_us = ucol_open("en_US", &status); 231 if (U_FAILURE(status)){ 232 log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n", 233 myErrorName(status)); 234 return; 235 } 236 237 for (codepoint = 1; codepoint < 0xFFFE;) 238 { 239 test = source; 240 241 while (codepoint % 0xFF != 0) 242 { 243 if (u_isdefined(codepoint)) 244 *(test ++) = codepoint; 245 codepoint ++; 246 } 247 248 if (u_isdefined(codepoint)) 249 *(test ++) = codepoint; 250 251 if (codepoint != 0xFFFF) 252 codepoint ++; 253 254 *test = 0; 255 iter=ucol_openElements(en_us, source, u_strlen(source), &status); 256 if(U_FAILURE(status)){ 257 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 258 myErrorName(status)); 259 ucol_close(en_us); 260 return; 261 } 262 /* A basic test to see if it's working at all */ 263 log_verbose("codepoint testing %x\n", codepoint); 264 backAndForth(iter); 265 ucol_closeElements(iter); 266 267 /* null termination test */ 268 iter=ucol_openElements(en_us, source, -1, &status); 269 if(U_FAILURE(status)){ 270 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 271 myErrorName(status)); 272 ucol_close(en_us); 273 return; 274 } 275 /* A basic test to see if it's working at all */ 276 backAndForth(iter); 277 ucol_closeElements(iter); 278 } 279 280 ucol_close(en_us); 281 } 282 283 /** 284 * Test for CollationElementIterator previous and next for the whole set of 285 * unicode characters with normalization on. 286 */ 287 static void TestNormalizedUnicodeChar() 288 { 289 UChar source[0x100]; 290 UCollator *th_th; 291 UCollationElements *iter; 292 UErrorCode status = U_ZERO_ERROR; 293 UChar codepoint; 294 295 UChar *test; 296 /* thai should have normalization on */ 297 th_th = ucol_open("th_TH", &status); 298 if (U_FAILURE(status)){ 299 log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n", 300 myErrorName(status)); 301 return; 302 } 303 304 for (codepoint = 1; codepoint < 0xFFFE;) 305 { 306 test = source; 307 308 while (codepoint % 0xFF != 0) 309 { 310 if (u_isdefined(codepoint)) 311 *(test ++) = codepoint; 312 codepoint ++; 313 } 314 315 if (u_isdefined(codepoint)) 316 *(test ++) = codepoint; 317 318 if (codepoint != 0xFFFF) 319 codepoint ++; 320 321 *test = 0; 322 iter=ucol_openElements(th_th, source, u_strlen(source), &status); 323 if(U_FAILURE(status)){ 324 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 325 myErrorName(status)); 326 ucol_close(th_th); 327 return; 328 } 329 330 backAndForth(iter); 331 ucol_closeElements(iter); 332 333 iter=ucol_openElements(th_th, source, -1, &status); 334 if(U_FAILURE(status)){ 335 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 336 myErrorName(status)); 337 ucol_close(th_th); 338 return; 339 } 340 341 backAndForth(iter); 342 ucol_closeElements(iter); 343 } 344 345 ucol_close(th_th); 346 } 347 348 /** 349 * Test the incremental normalization 350 */ 351 static void TestNormalization() 352 { 353 UErrorCode status = U_ZERO_ERROR; 354 const char *str = 355 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315"; 356 UCollator *coll; 357 UChar rule[50]; 358 int rulelen = u_unescape(str, rule, 50); 359 int count = 0; 360 const char *testdata[] = 361 {"\\u1ED9", "o\\u0323\\u0302", 362 "\\u0300\\u0315", "\\u0315\\u0300", 363 "A\\u0300\\u0315B", "A\\u0315\\u0300B", 364 "A\\u0316\\u0315B", "A\\u0315\\u0316B", 365 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316", 366 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B", 367 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"}; 368 int32_t srclen; 369 UChar source[10]; 370 UCollationElements *iter; 371 372 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status); 373 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 374 if (U_FAILURE(status)){ 375 log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n", 376 myErrorName(status)); 377 return; 378 } 379 380 srclen = u_unescape(testdata[0], source, 10); 381 iter = ucol_openElements(coll, source, srclen, &status); 382 backAndForth(iter); 383 ucol_closeElements(iter); 384 385 srclen = u_unescape(testdata[1], source, 10); 386 iter = ucol_openElements(coll, source, srclen, &status); 387 backAndForth(iter); 388 ucol_closeElements(iter); 389 390 while (count < 12) { 391 srclen = u_unescape(testdata[count], source, 10); 392 iter = ucol_openElements(coll, source, srclen, &status); 393 394 if (U_FAILURE(status)){ 395 log_err("ERROR: in creation of collator element iterator\n %s\n", 396 myErrorName(status)); 397 return; 398 } 399 backAndForth(iter); 400 ucol_closeElements(iter); 401 402 iter = ucol_openElements(coll, source, -1, &status); 403 404 if (U_FAILURE(status)){ 405 log_err("ERROR: in creation of collator element iterator\n %s\n", 406 myErrorName(status)); 407 return; 408 } 409 backAndForth(iter); 410 ucol_closeElements(iter); 411 count ++; 412 } 413 ucol_close(coll); 414 } 415 416 /** 417 * Test for CollationElementIterator.previous() 418 * 419 * @bug 4108758 - Make sure it works with contracting characters 420 * 421 */ 422 static void TestPrevious() 423 { 424 UCollator *coll=NULL; 425 UChar rule[50]; 426 UChar *source; 427 UCollator *c1, *c2, *c3; 428 UCollationElements *iter; 429 UErrorCode status = U_ZERO_ERROR; 430 UChar test1[50]; 431 UChar test2[50]; 432 433 u_uastrcpy(test1, "What subset of all possible test cases?"); 434 u_uastrcpy(test2, "has the highest probability of detecting"); 435 coll = ucol_open("en_US", &status); 436 437 iter=ucol_openElements(coll, test1, u_strlen(test1), &status); 438 log_verbose("English locale testing back and forth\n"); 439 if(U_FAILURE(status)){ 440 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 441 myErrorName(status)); 442 ucol_close(coll); 443 return; 444 } 445 /* A basic test to see if it's working at all */ 446 backAndForth(iter); 447 ucol_closeElements(iter); 448 ucol_close(coll); 449 450 /* Test with a contracting character sequence */ 451 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH"); 452 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 453 454 log_verbose("Contraction rule testing back and forth with no normalization\n"); 455 456 if (c1 == NULL || U_FAILURE(status)) 457 { 458 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n", 459 myErrorName(status)); 460 return; 461 } 462 source=(UChar*)malloc(sizeof(UChar) * 20); 463 u_uastrcpy(source, "abchdcba"); 464 iter=ucol_openElements(c1, source, u_strlen(source), &status); 465 if(U_FAILURE(status)){ 466 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 467 myErrorName(status)); 468 return; 469 } 470 backAndForth(iter); 471 ucol_closeElements(iter); 472 ucol_close(c1); 473 474 /* Test with an expanding character sequence */ 475 u_uastrcpy(rule, "&a < b < c/abd < d"); 476 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 477 log_verbose("Expansion rule testing back and forth with no normalization\n"); 478 if (c2 == NULL || U_FAILURE(status)) 479 { 480 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 481 myErrorName(status)); 482 return; 483 } 484 u_uastrcpy(source, "abcd"); 485 iter=ucol_openElements(c2, source, u_strlen(source), &status); 486 if(U_FAILURE(status)){ 487 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 488 myErrorName(status)); 489 return; 490 } 491 backAndForth(iter); 492 ucol_closeElements(iter); 493 ucol_close(c2); 494 /* Now try both */ 495 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch"); 496 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status); 497 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n"); 498 499 if (c3 == NULL || U_FAILURE(status)) 500 { 501 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 502 myErrorName(status)); 503 return; 504 } 505 u_uastrcpy(source, "abcdbchdc"); 506 iter=ucol_openElements(c3, source, u_strlen(source), &status); 507 if(U_FAILURE(status)){ 508 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 509 myErrorName(status)); 510 return; 511 } 512 backAndForth(iter); 513 ucol_closeElements(iter); 514 ucol_close(c3); 515 source[0] = 0x0e41; 516 source[1] = 0x0e02; 517 source[2] = 0x0e41; 518 source[3] = 0x0e02; 519 source[4] = 0x0e27; 520 source[5] = 0x61; 521 source[6] = 0x62; 522 source[7] = 0x63; 523 source[8] = 0; 524 525 coll = ucol_open("th_TH", &status); 526 log_verbose("Thai locale testing back and forth with normalization\n"); 527 iter=ucol_openElements(coll, source, u_strlen(source), &status); 528 if(U_FAILURE(status)){ 529 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 530 myErrorName(status)); 531 return; 532 } 533 backAndForth(iter); 534 ucol_closeElements(iter); 535 ucol_close(coll); 536 537 /* prev test */ 538 source[0] = 0x0061; 539 source[1] = 0x30CF; 540 source[2] = 0x3099; 541 source[3] = 0x30FC; 542 source[4] = 0; 543 544 coll = ucol_open("ja_JP", &status); 545 log_verbose("Japanese locale testing back and forth with normalization\n"); 546 iter=ucol_openElements(coll, source, u_strlen(source), &status); 547 if(U_FAILURE(status)){ 548 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 549 myErrorName(status)); 550 return; 551 } 552 backAndForth(iter); 553 ucol_closeElements(iter); 554 ucol_close(coll); 555 556 free(source); 557 } 558 559 /** 560 * Test for getOffset() and setOffset() 561 */ 562 static void TestOffset() 563 { 564 UErrorCode status= U_ZERO_ERROR; 565 UCollator *en_us=NULL; 566 UCollationElements *iter, *pristine; 567 int32_t offset; 568 OrderAndOffset *orders; 569 int32_t orderLength=0; 570 int count = 0; 571 UChar test1[50]; 572 UChar test2[50]; 573 574 u_uastrcpy(test1, "What subset of all possible test cases?"); 575 u_uastrcpy(test2, "has the highest probability of detecting"); 576 en_us = ucol_open("en_US", &status); 577 log_verbose("Testing getOffset and setOffset for collations\n"); 578 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status); 579 if(U_FAILURE(status)){ 580 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 581 myErrorName(status)); 582 ucol_close(en_us); 583 return; 584 } 585 586 /* testing boundaries */ 587 ucol_setOffset(iter, 0, &status); 588 if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) { 589 log_err("Error: After setting offset to 0, we should be at the end " 590 "of the backwards iteration"); 591 } 592 ucol_setOffset(iter, u_strlen(test1), &status); 593 if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) { 594 log_err("Error: After setting offset to end of the string, we should " 595 "be at the end of the backwards iteration"); 596 } 597 598 /* Run all the way through the iterator, then get the offset */ 599 600 orders = getOrders(iter, &orderLength); 601 602 offset = ucol_getOffset(iter); 603 604 if (offset != u_strlen(test1)) 605 { 606 log_err("offset at end != length %d vs %d\n", offset, 607 u_strlen(test1) ); 608 } 609 610 /* Now set the offset back to the beginning and see if it works */ 611 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status); 612 if(U_FAILURE(status)){ 613 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 614 myErrorName(status)); 615 ucol_close(en_us); 616 return; 617 } 618 status = U_ZERO_ERROR; 619 620 ucol_setOffset(iter, 0, &status); 621 if (U_FAILURE(status)) 622 { 623 log_err("setOffset failed. %s\n", myErrorName(status)); 624 } 625 else 626 { 627 assertEqual(iter, pristine); 628 } 629 630 ucol_closeElements(pristine); 631 ucol_closeElements(iter); 632 free(orders); 633 634 /* testing offsets in normalization buffer */ 635 test1[0] = 0x61; 636 test1[1] = 0x300; 637 test1[2] = 0x316; 638 test1[3] = 0x62; 639 test1[4] = 0; 640 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 641 iter = ucol_openElements(en_us, test1, 4, &status); 642 if(U_FAILURE(status)){ 643 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 644 myErrorName(status)); 645 ucol_close(en_us); 646 return; 647 } 648 649 count = 0; 650 while (ucol_next(iter, &status) != UCOL_NULLORDER && 651 U_SUCCESS(status)) { 652 switch (count) { 653 case 0: 654 if (ucol_getOffset(iter) != 1) { 655 log_err("ERROR: Offset of iteration should be 1\n"); 656 } 657 break; 658 case 3: 659 if (ucol_getOffset(iter) != 4) { 660 log_err("ERROR: Offset of iteration should be 4\n"); 661 } 662 break; 663 default: 664 if (ucol_getOffset(iter) != 3) { 665 log_err("ERROR: Offset of iteration should be 3\n"); 666 } 667 } 668 count ++; 669 } 670 671 ucol_reset(iter); 672 count = 0; 673 while (ucol_previous(iter, &status) != UCOL_NULLORDER && 674 U_SUCCESS(status)) { 675 switch (count) { 676 case 0: 677 case 1: 678 if (ucol_getOffset(iter) != 3) { 679 log_err("ERROR: Offset of iteration should be 3\n"); 680 } 681 break; 682 case 2: 683 if (ucol_getOffset(iter) != 1) { 684 log_err("ERROR: Offset of iteration should be 1\n"); 685 } 686 break; 687 default: 688 if (ucol_getOffset(iter) != 0) { 689 log_err("ERROR: Offset of iteration should be 0\n"); 690 } 691 } 692 count ++; 693 } 694 695 if(U_FAILURE(status)){ 696 log_err("ERROR: in iterating collation elements %s\n", 697 myErrorName(status)); 698 } 699 700 ucol_closeElements(iter); 701 ucol_close(en_us); 702 } 703 704 /** 705 * Test for setText() 706 */ 707 static void TestSetText() 708 { 709 int32_t c,i; 710 UErrorCode status = U_ZERO_ERROR; 711 UCollator *en_us=NULL; 712 UCollationElements *iter1, *iter2; 713 UChar test1[50]; 714 UChar test2[50]; 715 716 u_uastrcpy(test1, "What subset of all possible test cases?"); 717 u_uastrcpy(test2, "has the highest probability of detecting"); 718 en_us = ucol_open("en_US", &status); 719 log_verbose("testing setText for Collation elements\n"); 720 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status); 721 if(U_FAILURE(status)){ 722 log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", 723 myErrorName(status)); 724 ucol_close(en_us); 725 return; 726 } 727 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status); 728 if(U_FAILURE(status)){ 729 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n", 730 myErrorName(status)); 731 ucol_close(en_us); 732 return; 733 } 734 735 /* Run through the second iterator just to exercise it */ 736 c = ucol_next(iter2, &status); 737 i = 0; 738 739 while ( ++i < 10 && (c != UCOL_NULLORDER)) 740 { 741 if (U_FAILURE(status)) 742 { 743 log_err("iter2->next() returned an error. %s\n", myErrorName(status)); 744 ucol_closeElements(iter2); 745 ucol_closeElements(iter1); 746 ucol_close(en_us); 747 return; 748 } 749 750 c = ucol_next(iter2, &status); 751 } 752 753 /* Now set it to point to the same string as the first iterator */ 754 ucol_setText(iter2, test1, u_strlen(test1), &status); 755 if (U_FAILURE(status)) 756 { 757 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status)); 758 } 759 else 760 { 761 assertEqual(iter1, iter2); 762 } 763 764 /* Now set it to point to a null string with fake length*/ 765 ucol_setText(iter2, NULL, 2, &status); 766 if (U_FAILURE(status)) 767 { 768 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)); 769 } 770 else 771 { 772 if (ucol_next(iter2, &status) != UCOL_NULLORDER) { 773 log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); 774 } 775 } 776 777 ucol_closeElements(iter2); 778 ucol_closeElements(iter1); 779 ucol_close(en_us); 780 } 781 782 /** @bug 4108762 783 * Test for getMaxExpansion() 784 */ 785 static void TestMaxExpansion() 786 { 787 UErrorCode status = U_ZERO_ERROR; 788 UCollator *coll ;/*= ucol_open("en_US", &status);*/ 789 UChar ch = 0; 790 UChar32 unassigned = 0xEFFFD; 791 UChar supplementary[2]; 792 uint32_t stringOffset = 0; 793 UBool isError = FALSE; 794 uint32_t sorder = 0; 795 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/ 796 uint32_t temporder = 0; 797 798 UChar rule[256]; 799 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch"); 800 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 801 UCOL_DEFAULT_STRENGTH,NULL, &status); 802 if(U_SUCCESS(status) && coll) { 803 iter = ucol_openElements(coll, &ch, 1, &status); 804 805 while (ch < 0xFFFF && U_SUCCESS(status)) { 806 int count = 1; 807 uint32_t order; 808 int32_t size = 0; 809 810 ch ++; 811 812 ucol_setText(iter, &ch, 1, &status); 813 order = ucol_previous(iter, &status); 814 815 /* thai management */ 816 if (order == 0) 817 order = ucol_previous(iter, &status); 818 819 while (U_SUCCESS(status) && 820 ucol_previous(iter, &status) != UCOL_NULLORDER) { 821 count ++; 822 } 823 824 size = ucol_getMaxExpansion(iter, order); 825 if (U_FAILURE(status) || size < count) { 826 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 827 ch, count); 828 } 829 } 830 831 /* testing for exact max expansion */ 832 ch = 0; 833 while (ch < 0x61) { 834 uint32_t order; 835 int32_t size; 836 ucol_setText(iter, &ch, 1, &status); 837 order = ucol_previous(iter, &status); 838 size = ucol_getMaxExpansion(iter, order); 839 if (U_FAILURE(status) || size != 1) { 840 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 841 ch, 1); 842 } 843 ch ++; 844 } 845 846 ch = 0x63; 847 ucol_setText(iter, &ch, 1, &status); 848 temporder = ucol_previous(iter, &status); 849 850 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) { 851 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 852 ch, 3); 853 } 854 855 ch = 0x64; 856 ucol_setText(iter, &ch, 1, &status); 857 temporder = ucol_previous(iter, &status); 858 859 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) { 860 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 861 ch, 3); 862 } 863 864 U16_APPEND(supplementary, stringOffset, 2, unassigned, isError); 865 (void)isError; /* Suppress set but not used warning. */ 866 ucol_setText(iter, supplementary, 2, &status); 867 sorder = ucol_previous(iter, &status); 868 869 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) { 870 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 871 ch, 2); 872 } 873 874 /* testing jamo */ 875 ch = 0x1165; 876 877 ucol_setText(iter, &ch, 1, &status); 878 temporder = ucol_previous(iter, &status); 879 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) { 880 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 881 ch, 3); 882 } 883 884 ucol_closeElements(iter); 885 ucol_close(coll); 886 887 /* testing special jamo &a<\u1160 */ 888 rule[0] = 0x26; 889 rule[1] = 0x71; 890 rule[2] = 0x3c; 891 rule[3] = 0x1165; 892 rule[4] = 0x2f; 893 rule[5] = 0x71; 894 rule[6] = 0x71; 895 rule[7] = 0x71; 896 rule[8] = 0x71; 897 rule[9] = 0; 898 899 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 900 UCOL_DEFAULT_STRENGTH,NULL, &status); 901 iter = ucol_openElements(coll, &ch, 1, &status); 902 903 temporder = ucol_previous(iter, &status); 904 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) { 905 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 906 ch, 5); 907 } 908 909 ucol_closeElements(iter); 910 ucol_close(coll); 911 } else { 912 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 913 } 914 915 } 916 917 918 static void assertEqual(UCollationElements *i1, UCollationElements *i2) 919 { 920 int32_t c1, c2; 921 int32_t count = 0; 922 UErrorCode status = U_ZERO_ERROR; 923 924 do 925 { 926 c1 = ucol_next(i1, &status); 927 c2 = ucol_next(i2, &status); 928 929 if (c1 != c2) 930 { 931 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2); 932 break; 933 } 934 935 count += 1; 936 } 937 while (c1 != UCOL_NULLORDER); 938 } 939 940 /** 941 * Testing iterators with extremely small buffers 942 */ 943 static void TestSmallBuffer() 944 { 945 UErrorCode status = U_ZERO_ERROR; 946 UCollator *coll; 947 UCollationElements *testiter, 948 *iter; 949 int32_t count = 0; 950 OrderAndOffset *testorders, 951 *orders; 952 953 UChar teststr[500]; 954 UChar str[] = {0x300, 0x31A, 0}; 955 /* 956 creating a long string of decomposable characters, 957 since by default the writable buffer is of size 256 958 */ 959 while (count < 500) { 960 if ((count & 1) == 0) { 961 teststr[count ++] = 0x300; 962 } 963 else { 964 teststr[count ++] = 0x31A; 965 } 966 } 967 968 coll = ucol_open("th_TH", &status); 969 if(U_SUCCESS(status) && coll) { 970 testiter = ucol_openElements(coll, teststr, 500, &status); 971 iter = ucol_openElements(coll, str, 2, &status); 972 973 orders = getOrders(iter, &count); 974 if (count != 2) { 975 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n"); 976 } 977 978 /* 979 this will rearrange the string data to 250 characters of 0x300 first then 980 250 characters of 0x031A 981 */ 982 testorders = getOrders(testiter, &count); 983 984 if (count != 500) { 985 log_err("Error decomposition does not give the right sized collation elements\n"); 986 } 987 988 while (count != 0) { 989 /* UCA collation element for 0x0F76 */ 990 if ((count > 250 && testorders[-- count].order != orders[1].order) || 991 (count <= 250 && testorders[-- count].order != orders[0].order)) { 992 log_err("Error decomposition does not give the right collation element at %d count\n", count); 993 break; 994 } 995 } 996 997 free(testorders); 998 free(orders); 999 1000 ucol_reset(testiter); 1001 1002 /* ensures closing of elements done properly to clear writable buffer */ 1003 ucol_next(testiter, &status); 1004 ucol_next(testiter, &status); 1005 ucol_closeElements(testiter); 1006 ucol_closeElements(iter); 1007 ucol_close(coll); 1008 } else { 1009 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 1010 } 1011 } 1012 1013 /** 1014 * Sniplets of code from genuca 1015 */ 1016 static int32_t hex2num(char hex) { 1017 if(hex>='0' && hex <='9') { 1018 return hex-'0'; 1019 } else if(hex>='a' && hex<='f') { 1020 return hex-'a'+10; 1021 } else if(hex>='A' && hex<='F') { 1022 return hex-'A'+10; 1023 } else { 1024 return 0; 1025 } 1026 } 1027 1028 /** 1029 * Getting codepoints from a string 1030 * @param str character string contain codepoints seperated by space and ended 1031 * by a semicolon 1032 * @param codepoints array for storage, assuming size > 5 1033 * @return position at the end of the codepoint section 1034 */ 1035 static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { 1036 UErrorCode errorCode = U_ZERO_ERROR; 1037 char *semi = uprv_strchr(str, ';'); 1038 char *pipe = uprv_strchr(str, '|'); 1039 char *s; 1040 *codepoints = 0; 1041 *contextCPs = 0; 1042 if(semi == NULL) { 1043 log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str); 1044 return str; 1045 } 1046 if(pipe != NULL) { 1047 int32_t contextLength; 1048 *pipe = 0; 1049 contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode); 1050 *pipe = '|'; 1051 if(U_FAILURE(errorCode)) { 1052 log_err("error parsing precontext string from FractionalUCA.txt %s\n", str); 1053 return str; 1054 } 1055 /* prepend the precontext string to the codepoints */ 1056 u_memcpy(codepoints, contextCPs, contextLength); 1057 codepoints += contextLength; 1058 /* start of the code point string */ 1059 s = pipe + 1; 1060 } else { 1061 s = str; 1062 } 1063 u_parseString(s, codepoints, 99, NULL, &errorCode); 1064 if(U_FAILURE(errorCode)) { 1065 log_err("error parsing code point string from FractionalUCA.txt %s\n", str); 1066 return str; 1067 } 1068 return semi + 1; 1069 } 1070 1071 /** 1072 * Sniplets of code from genuca 1073 */ 1074 static int32_t 1075 readElement(char **from, char *to, char separator, UErrorCode *status) 1076 { 1077 if (U_SUCCESS(*status)) { 1078 char buffer[1024]; 1079 int32_t i = 0; 1080 while (**from != separator) { 1081 if (**from != ' ') { 1082 *(buffer+i++) = **from; 1083 } 1084 (*from)++; 1085 } 1086 (*from)++; 1087 *(buffer + i) = 0; 1088 strcpy(to, buffer); 1089 return i/2; 1090 } 1091 1092 return 0; 1093 } 1094 1095 /** 1096 * Sniplets of code from genuca 1097 */ 1098 static uint32_t 1099 getSingleCEValue(char *primary, char *secondary, char *tertiary, 1100 UErrorCode *status) 1101 { 1102 if (U_SUCCESS(*status)) { 1103 uint32_t value = 0; 1104 char primsave = '\0'; 1105 char secsave = '\0'; 1106 char tersave = '\0'; 1107 char *primend = primary+4; 1108 char *secend = secondary+2; 1109 char *terend = tertiary+2; 1110 uint32_t primvalue; 1111 uint32_t secvalue; 1112 uint32_t tervalue; 1113 1114 if (uprv_strlen(primary) > 4) { 1115 primsave = *primend; 1116 *primend = '\0'; 1117 } 1118 1119 if (uprv_strlen(secondary) > 2) { 1120 secsave = *secend; 1121 *secend = '\0'; 1122 } 1123 1124 if (uprv_strlen(tertiary) > 2) { 1125 tersave = *terend; 1126 *terend = '\0'; 1127 } 1128 1129 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; 1130 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; 1131 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; 1132 if(primvalue <= 0xFF) { 1133 primvalue <<= 8; 1134 } 1135 1136 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) 1137 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) 1138 | (tervalue & UCOL_TERTIARYORDERMASK); 1139 1140 if(primsave!='\0') { 1141 *primend = primsave; 1142 } 1143 if(secsave!='\0') { 1144 *secend = secsave; 1145 } 1146 if(tersave!='\0') { 1147 *terend = tersave; 1148 } 1149 return value; 1150 } 1151 return 0; 1152 } 1153 1154 /** 1155 * Getting collation elements generated from a string 1156 * @param str character string contain collation elements contained in [] and 1157 * seperated by space 1158 * @param ce array for storage, assuming size > 20 1159 * @param status error status 1160 * @return position at the end of the codepoint section 1161 */ 1162 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { 1163 char *pStartCP = uprv_strchr(str, '['); 1164 int count = 0; 1165 char *pEndCP; 1166 char primary[100]; 1167 char secondary[100]; 1168 char tertiary[100]; 1169 1170 while (*pStartCP == '[') { 1171 uint32_t primarycount = 0; 1172 uint32_t secondarycount = 0; 1173 uint32_t tertiarycount = 0; 1174 uint32_t CEi = 1; 1175 pEndCP = strchr(pStartCP, ']'); 1176 if(pEndCP == NULL) { 1177 break; 1178 } 1179 pStartCP ++; 1180 1181 primarycount = readElement(&pStartCP, primary, ',', status); 1182 secondarycount = readElement(&pStartCP, secondary, ',', status); 1183 tertiarycount = readElement(&pStartCP, tertiary, ']', status); 1184 1185 /* I want to get the CEs entered right here, including continuation */ 1186 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); 1187 if (U_FAILURE(*status)) { 1188 break; 1189 } 1190 1191 while (2 * CEi < primarycount || CEi < secondarycount || 1192 CEi < tertiarycount) { 1193 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 1194 if (2 * CEi < primarycount) { 1195 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); 1196 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); 1197 } 1198 1199 if (2 * CEi + 1 < primarycount) { 1200 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); 1201 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); 1202 } 1203 1204 if (CEi < secondarycount) { 1205 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); 1206 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); 1207 } 1208 1209 if (CEi < tertiarycount) { 1210 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); 1211 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); 1212 } 1213 1214 CEi ++; 1215 ces[count ++] = value; 1216 } 1217 1218 pStartCP = pEndCP + 1; 1219 } 1220 ces[count] = 0; 1221 return pStartCP; 1222 } 1223 1224 /** 1225 * Getting the FractionalUCA.txt file stream 1226 */ 1227 static FileStream * getFractionalUCA(void) 1228 { 1229 char newPath[256]; 1230 char backupPath[256]; 1231 FileStream *result = NULL; 1232 1233 /* Look inside ICU_DATA first */ 1234 uprv_strcpy(newPath, ctest_dataSrcDir()); 1235 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); 1236 uprv_strcat(newPath, "FractionalUCA.txt"); 1237 1238 /* As a fallback, try to guess where the source data was located 1239 * at the time ICU was built, and look there. 1240 */ 1241 #if defined (U_TOPSRCDIR) 1242 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); 1243 #else 1244 { 1245 UErrorCode errorCode = U_ZERO_ERROR; 1246 strcpy(backupPath, loadTestData(&errorCode)); 1247 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); 1248 } 1249 #endif 1250 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt"); 1251 1252 result = T_FileStream_open(newPath, "rb"); 1253 1254 if (result == NULL) { 1255 result = T_FileStream_open(backupPath, "rb"); 1256 if (result == NULL) { 1257 log_err("Failed to open either %s or %s\n", newPath, backupPath); 1258 } 1259 } 1260 return result; 1261 } 1262 1263 /** 1264 * Testing the CEs returned by the iterator 1265 */ 1266 static void TestCEs() { 1267 FileStream *file = NULL; 1268 char line[2048]; 1269 char *str; 1270 UChar codepoints[10]; 1271 uint32_t ces[20]; 1272 UErrorCode status = U_ZERO_ERROR; 1273 UCollator *coll = ucol_open("", &status); 1274 uint32_t lineNo = 0; 1275 UChar contextCPs[5]; 1276 1277 if (U_FAILURE(status)) { 1278 log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status)); 1279 return; 1280 } 1281 1282 file = getFractionalUCA(); 1283 1284 if (file == NULL) { 1285 log_err("*** unable to open input FractionalUCA.txt file ***\n"); 1286 return; 1287 } 1288 1289 1290 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1291 int count = 0; 1292 UCollationElements *iter; 1293 int32_t preContextCeLen=0; 1294 lineNo++; 1295 /* skip this line if it is empty or a comment or is a return value 1296 or start of some variable section */ 1297 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1298 line[0] == 0x000D || line[0] == '[') { 1299 continue; 1300 } 1301 1302 str = getCodePoints(line, codepoints, contextCPs); 1303 1304 /* these are 'fake' codepoints in the fractional UCA, and are used just 1305 * for positioning of indirect values. They should not go through this 1306 * test. 1307 */ 1308 if(*codepoints == 0xFDD0) { 1309 continue; 1310 } 1311 if (*contextCPs != 0) { 1312 iter = ucol_openElements(coll, contextCPs, -1, &status); 1313 if (U_FAILURE(status)) { 1314 log_err("Error in opening collation elements\n"); 1315 break; 1316 } 1317 while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) { 1318 preContextCeLen++; 1319 } 1320 ucol_closeElements(iter); 1321 } 1322 1323 getCEs(str, ces+preContextCeLen, &status); 1324 if (U_FAILURE(status)) { 1325 log_err("Error in parsing collation elements in FractionalUCA.txt\n"); 1326 break; 1327 } 1328 iter = ucol_openElements(coll, codepoints, -1, &status); 1329 if (U_FAILURE(status)) { 1330 log_err("Error in opening collation elements\n"); 1331 break; 1332 } 1333 for (;;) { 1334 uint32_t ce = (uint32_t)ucol_next(iter, &status); 1335 if (ce == 0xFFFFFFFF) { 1336 ce = 0; 1337 } 1338 /* we now unconditionally reorder Thai/Lao prevowels, so this 1339 * test would fail if we don't skip here. 1340 */ 1341 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { 1342 continue; 1343 } 1344 if (ce != ces[count] || U_FAILURE(status)) { 1345 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n"); 1346 break; 1347 } 1348 if (ces[count] == 0) { 1349 break; 1350 } 1351 count ++; 1352 } 1353 ucol_closeElements(iter); 1354 } 1355 1356 T_FileStream_close(file); 1357 ucol_close(coll); 1358 } 1359 1360 /** 1361 * Testing the discontigous contractions 1362 */ 1363 static void TestDiscontiguos() { 1364 const char *rulestr = 1365 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; 1366 UChar rule[50]; 1367 int rulelen = u_unescape(rulestr, rule, 50); 1368 const char *src[] = { 1369 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", 1370 /* base character blocked */ 1371 "XD\\u0300", "XD\\u0300\\u0315", 1372 /* non blocking combining character */ 1373 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315", 1374 /* blocking combining character */ 1375 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315", 1376 /* contraction prefix */ 1377 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315", 1378 "X\\u0300\\u031A\\u0315", 1379 /* ends not with a contraction character */ 1380 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D", 1381 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D" 1382 }; 1383 const char *tgt[] = { 1384 /* non blocking combining character */ 1385 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC", 1386 /* base character blocked */ 1387 "X D \\u0300", "X D \\u0300\\u0315", 1388 /* non blocking combining character */ 1389 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319", 1390 /* blocking combining character */ 1391 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315", 1392 /* contraction prefix */ 1393 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319", 1394 "X\\u0300 \\u031A \\u0315", 1395 /* ends not with a contraction character */ 1396 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D", 1397 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D" 1398 }; 1399 int size = 20; 1400 UCollator *coll; 1401 UErrorCode status = U_ZERO_ERROR; 1402 int count = 0; 1403 UCollationElements *iter; 1404 UCollationElements *resultiter; 1405 1406 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status); 1407 iter = ucol_openElements(coll, rule, 1, &status); 1408 resultiter = ucol_openElements(coll, rule, 1, &status); 1409 1410 if (U_FAILURE(status)) { 1411 log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status)); 1412 return; 1413 } 1414 1415 while (count < size) { 1416 UChar str[20]; 1417 UChar tstr[20]; 1418 int strLen = u_unescape(src[count], str, 20); 1419 UChar *s; 1420 1421 ucol_setText(iter, str, strLen, &status); 1422 if (U_FAILURE(status)) { 1423 log_err("Error opening collation iterator\n"); 1424 return; 1425 } 1426 1427 u_unescape(tgt[count], tstr, 20); 1428 s = tstr; 1429 1430 log_verbose("count %d\n", count); 1431 1432 for (;;) { 1433 uint32_t ce; 1434 UChar *e = u_strchr(s, 0x20); 1435 if (e == 0) { 1436 e = u_strchr(s, 0); 1437 } 1438 ucol_setText(resultiter, s, (int32_t)(e - s), &status); 1439 ce = ucol_next(resultiter, &status); 1440 if (U_FAILURE(status)) { 1441 log_err("Error manipulating collation iterator\n"); 1442 return; 1443 } 1444 while (ce != UCOL_NULLORDER) { 1445 if (ce != (uint32_t)ucol_next(iter, &status) || 1446 U_FAILURE(status)) { 1447 log_err("Discontiguos contraction test mismatch\n"); 1448 return; 1449 } 1450 ce = ucol_next(resultiter, &status); 1451 if (U_FAILURE(status)) { 1452 log_err("Error getting next collation element\n"); 1453 return; 1454 } 1455 } 1456 s = e + 1; 1457 if (*e == 0) { 1458 break; 1459 } 1460 } 1461 ucol_reset(iter); 1462 backAndForth(iter); 1463 count ++; 1464 } 1465 ucol_closeElements(resultiter); 1466 ucol_closeElements(iter); 1467 ucol_close(coll); 1468 } 1469 1470 static void TestCEBufferOverflow() 1471 { 1472 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; 1473 UErrorCode status = U_ZERO_ERROR; 1474 UChar rule[10]; 1475 UCollator *coll; 1476 UCollationElements *iter; 1477 1478 u_uastrcpy(rule, "&z < AB"); 1479 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status); 1480 if (U_FAILURE(status)) { 1481 log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status)); 1482 return; 1483 } 1484 1485 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic 1486 test. this will cause an overflow in getPrev */ 1487 str[0] = 0x0041; /* 'A' */ 1488 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ 1489 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); 1490 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ 1491 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, 1492 &status); 1493 if (ucol_previous(iter, &status) == UCOL_NULLORDER || 1494 status == U_BUFFER_OVERFLOW_ERROR) { 1495 log_err("CE buffer should not overflow with long string of trail surrogates\n"); 1496 } 1497 ucol_closeElements(iter); 1498 ucol_close(coll); 1499 } 1500 1501 /** 1502 * Checking collation element validity. 1503 */ 1504 #define MAX_CODEPOINTS_TO_SHOW 10 1505 static void showCodepoints(const UChar *codepoints, int length, char * codepointText) { 1506 int i, lengthToUse = length; 1507 if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) { 1508 lengthToUse = MAX_CODEPOINTS_TO_SHOW; 1509 } 1510 for (i = 0; i < lengthToUse; ++i) { 1511 int bytesWritten = sprintf(codepointText, " %04X", *codepoints++); 1512 if (bytesWritten <= 0) { 1513 break; 1514 } 1515 codepointText += bytesWritten; 1516 } 1517 if (i < length) { 1518 sprintf(codepointText, " ..."); 1519 } 1520 } 1521 1522 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, 1523 int length) 1524 { 1525 UErrorCode status = U_ZERO_ERROR; 1526 UCollationElements *iter = ucol_openElements(coll, codepoints, length, 1527 &status); 1528 UBool result = FALSE; 1529 UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE; 1530 const char * collLocale; 1531 1532 if (U_FAILURE(status)) { 1533 log_err("Error creating iterator for testing validity\n"); 1534 return FALSE; 1535 } 1536 collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status); 1537 if (U_FAILURE(status) || collLocale==NULL) { 1538 status = U_ZERO_ERROR; 1539 collLocale = "?"; 1540 } 1541 1542 for (;;) { 1543 uint32_t ce = ucol_next(iter, &status); 1544 uint32_t primary, p1, p2, secondary, tertiary; 1545 if (ce == UCOL_NULLORDER) { 1546 result = TRUE; 1547 break; 1548 } 1549 if (ce == 0) { 1550 continue; 1551 } 1552 if (ce == 0x02000202) { 1553 /* special CE for merge-sort character */ 1554 if (*codepoints == 0xFFFE /* && length == 1 */) { 1555 /* 1556 * Note: We should check for length==1 but the token parser appears 1557 * to give us trailing NUL characters. 1558 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet() 1559 * rather than the internal collation rule parser 1560 */ 1561 continue; 1562 } else { 1563 log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n", 1564 (int)*codepoints, (int)length); 1565 break; 1566 } 1567 } 1568 primary = UCOL_PRIMARYORDER(ce); 1569 p1 = primary >> 8; 1570 p2 = primary & 0xFF; 1571 secondary = UCOL_SECONDARYORDER(ce); 1572 tertiary = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION; 1573 1574 if (!isContinuation(ce)) { 1575 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { 1576 log_err("Empty CE %08lX except for case bits\n", (long)ce); 1577 break; 1578 } 1579 if (p1 == 0) { 1580 if (p2 != 0) { 1581 log_err("Primary 00 xx in %08lX\n", (long)ce); 1582 break; 1583 } 1584 primaryDone = TRUE; 1585 } else { 1586 if (p1 <= 2 || p1 >= 0xF0) { 1587 /* Primary first bytes F0..FF are specials. */ 1588 log_err("Primary first byte of %08lX out of range\n", (long)ce); 1589 break; 1590 } 1591 if (p2 == 0) { 1592 primaryDone = TRUE; 1593 } else { 1594 if (p2 <= 3 || p2 >= 0xFF) { 1595 /* Primary second bytes 03 and FF are sort key compression terminators. */ 1596 log_err("Primary second byte of %08lX out of range\n", (long)ce); 1597 break; 1598 } 1599 primaryDone = FALSE; 1600 } 1601 } 1602 if (secondary == 0) { 1603 if (primary != 0) { 1604 log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce); 1605 break; 1606 } 1607 secondaryDone = TRUE; 1608 } else { 1609 if (secondary <= 2 || 1610 (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80)) 1611 ) { 1612 /* Secondary first bytes common+1..+0x80 are used for sort key compression. */ 1613 log_err("Secondary byte of %08lX out of range\n", (long)ce); 1614 break; 1615 } 1616 secondaryDone = FALSE; 1617 } 1618 if (tertiary == 0) { 1619 /* We know that ce != 0. */ 1620 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce); 1621 break; 1622 } 1623 if (tertiary <= 2) { 1624 log_err("Tertiary byte of %08lX out of range\n", (long)ce); 1625 break; 1626 } 1627 tertiaryDone = FALSE; 1628 } else { 1629 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { 1630 log_err("Empty continuation %08lX\n", (long)ce); 1631 break; 1632 } 1633 if (primaryDone && primary != 0) { 1634 log_err("Primary was done but continues in %08lX\n", (long)ce); 1635 break; 1636 } 1637 if (p1 == 0) { 1638 if (p2 != 0) { 1639 log_err("Primary 00 xx in %08lX\n", (long)ce); 1640 break; 1641 } 1642 primaryDone = TRUE; 1643 } else { 1644 if (p1 <= 2) { 1645 log_err("Primary first byte of %08lX out of range\n", (long)ce); 1646 break; 1647 } 1648 if (p2 == 0) { 1649 primaryDone = TRUE; 1650 } else { 1651 if (p2 <= 3) { 1652 log_err("Primary second byte of %08lX out of range\n", (long)ce); 1653 break; 1654 } 1655 } 1656 } 1657 if (secondaryDone && secondary != 0) { 1658 log_err("Secondary was done but continues in %08lX\n", (long)ce); 1659 break; 1660 } 1661 if (secondary == 0) { 1662 secondaryDone = TRUE; 1663 } else { 1664 if (secondary <= 2) { 1665 log_err("Secondary byte of %08lX out of range\n", (long)ce); 1666 break; 1667 } 1668 } 1669 if (tertiaryDone && tertiary != 0) { 1670 log_err("Tertiary was done but continues in %08lX\n", (long)ce); 1671 break; 1672 } 1673 if (tertiary == 0) { 1674 tertiaryDone = TRUE; 1675 } else if (tertiary <= 2) { 1676 log_err("Tertiary byte of %08lX out of range\n", (long)ce); 1677 break; 1678 } 1679 } 1680 } 1681 if (!result) { 1682 char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5]; 1683 showCodepoints(codepoints, length, codepointText); 1684 log_err("Locale: %s Code point string: %s\n", collLocale, codepointText); 1685 } 1686 ucol_closeElements(iter); 1687 return result; 1688 } 1689 1690 static const UChar IMPORT[] = { 0x5B, 0x69, 0x6D, 0x70, 0x6F, 0x72, 0x74, 0 }; /* "[import" */ 1691 1692 static void TestCEValidity() 1693 { 1694 /* testing UCA collation elements */ 1695 UErrorCode status = U_ZERO_ERROR; 1696 /* en_US has no tailorings */ 1697 UCollator *coll = ucol_open("root", &status); 1698 /* tailored locales */ 1699 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"}; 1700 const char *loc; 1701 FileStream *file = NULL; 1702 char line[2048]; 1703 UChar codepoints[11]; 1704 int count = 0; 1705 int maxCount = 0; 1706 UChar contextCPs[3]; 1707 UChar32 c; 1708 UParseError parseError; 1709 if (U_FAILURE(status)) { 1710 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1711 return; 1712 } 1713 log_verbose("Testing UCA elements\n"); 1714 file = getFractionalUCA(); 1715 if (file == NULL) { 1716 log_err("Fractional UCA data can not be opened\n"); 1717 return; 1718 } 1719 1720 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1721 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1722 line[0] == 0x000D || line[0] == '[') { 1723 continue; 1724 } 1725 1726 getCodePoints(line, codepoints, contextCPs); 1727 checkCEValidity(coll, codepoints, u_strlen(codepoints)); 1728 } 1729 1730 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1731 for (c = 0; c <= 0xffff; ++c) { 1732 if (u_isdefined(c)) { 1733 codepoints[0] = (UChar)c; 1734 checkCEValidity(coll, codepoints, 1); 1735 } 1736 } 1737 for (; c <= 0x10ffff; ++c) { 1738 if (u_isdefined(c)) { 1739 int32_t i = 0; 1740 U16_APPEND_UNSAFE(codepoints, i, c); 1741 checkCEValidity(coll, codepoints, i); 1742 } 1743 } 1744 1745 ucol_close(coll); 1746 1747 /* testing tailored collation elements */ 1748 log_verbose("Testing tailored elements\n"); 1749 if(getTestOption(QUICK_OPTION)) { 1750 maxCount = sizeof(locale)/sizeof(locale[0]); 1751 } else { 1752 maxCount = uloc_countAvailable(); 1753 } 1754 while (count < maxCount) { 1755 const UChar *rules = NULL, 1756 *current = NULL; 1757 UChar *rulesCopy = NULL; 1758 int32_t ruleLen = 0; 1759 1760 uint32_t chOffset = 0; 1761 uint32_t chLen = 0; 1762 uint32_t exOffset = 0; 1763 uint32_t exLen = 0; 1764 uint32_t prefixOffset = 0; 1765 uint32_t prefixLen = 0; 1766 UBool startOfRules = TRUE; 1767 UColOptionSet opts; 1768 1769 UColTokenParser src; 1770 uint32_t strength = 0; 1771 uint16_t specs = 0; 1772 1773 (void)specs; /* Suppress set but not used warnings. */ 1774 (void)strength; 1775 (void)prefixLen; 1776 (void)prefixOffset; 1777 (void)exLen; 1778 (void)exOffset; 1779 1780 if(getTestOption(QUICK_OPTION)) { 1781 loc = locale[count]; 1782 } else { 1783 loc = uloc_getAvailable(count); 1784 if(!hasCollationElements(loc)) { 1785 count++; 1786 continue; 1787 } 1788 } 1789 status = U_ZERO_ERROR; // clear status from previous loop iteration 1790 1791 uprv_memset(&src, 0, sizeof(UColTokenParser)); 1792 1793 log_verbose("Testing CEs for %s\n", loc); 1794 1795 coll = ucol_open(loc, &status); 1796 if (U_FAILURE(status)) { 1797 log_err("%s collator creation failed with status %s\n", loc, u_errorName(status)); 1798 return; 1799 } 1800 1801 src.opts = &opts; 1802 rules = ucol_getRules(coll, &ruleLen); 1803 1804 /* 1805 * We have not set up the UColTokenParser with a callback function 1806 * to fetch [import] sub-rules, 1807 * so skip testing tailorings that import others. 1808 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet() 1809 * rather than the internal collation rule parser 1810 */ 1811 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { 1812 rulesCopy = (UChar *)uprv_malloc((ruleLen + 1813 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1814 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1815 src.current = src.source = rulesCopy; 1816 src.end = rulesCopy + ruleLen; 1817 src.extraCurrent = src.end; 1818 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1819 1820 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to 1821 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ 1822 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL && U_SUCCESS(status)) { 1823 strength = src.parsedToken.strength; 1824 chOffset = src.parsedToken.charsOffset; 1825 chLen = src.parsedToken.charsLen; 1826 exOffset = src.parsedToken.extensionOffset; 1827 exLen = src.parsedToken.extensionLen; 1828 prefixOffset = src.parsedToken.prefixOffset; 1829 prefixLen = src.parsedToken.prefixLen; 1830 specs = src.parsedToken.flags; 1831 1832 startOfRules = FALSE; 1833 uprv_memcpy(codepoints, src.source + chOffset, 1834 chLen * sizeof(UChar)); 1835 codepoints[chLen] = 0; 1836 checkCEValidity(coll, codepoints, chLen); 1837 } 1838 if (U_FAILURE(status)) { 1839 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", loc, u_errorName(status)); 1840 } 1841 uprv_free(src.source); 1842 uprv_free(src.reorderCodes); 1843 } 1844 1845 ucol_close(coll); 1846 count ++; 1847 } 1848 T_FileStream_close(file); 1849 } 1850 1851 static void printSortKeyError(const UChar *codepoints, int length, 1852 uint8_t *sortkey, int sklen) 1853 { 1854 int count = 0; 1855 log_err("Sortkey not valid for "); 1856 while (length > 0) { 1857 log_err("0x%04x ", *codepoints); 1858 length --; 1859 codepoints ++; 1860 } 1861 log_err("\nSortkey : "); 1862 while (count < sklen) { 1863 log_err("0x%02x ", sortkey[count]); 1864 count ++; 1865 } 1866 log_err("\n"); 1867 } 1868 1869 /** 1870 * Checking sort key validity for all levels 1871 */ 1872 static UBool checkSortKeyValidity(UCollator *coll, 1873 const UChar *codepoints, 1874 int length) 1875 { 1876 UErrorCode status = U_ZERO_ERROR; 1877 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, 1878 UCOL_TERTIARY, UCOL_QUATERNARY, 1879 UCOL_IDENTICAL}; 1880 int strengthlen = 5; 1881 int strengthIndex = 0; 1882 int caselevel = 0; 1883 1884 while (caselevel < 1) { 1885 if (caselevel == 0) { 1886 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); 1887 } 1888 else { 1889 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); 1890 } 1891 1892 while (strengthIndex < strengthlen) { 1893 int count01 = 0; 1894 uint32_t count = 0; 1895 uint8_t sortkey[128]; 1896 uint32_t sklen; 1897 1898 ucol_setStrength(coll, strength[strengthIndex]); 1899 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); 1900 while (sortkey[count] != 0) { 1901 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) { 1902 printSortKeyError(codepoints, length, sortkey, sklen); 1903 return FALSE; 1904 } 1905 if (sortkey[count] == 1) { 1906 count01 ++; 1907 } 1908 count ++; 1909 } 1910 1911 if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) { 1912 printSortKeyError(codepoints, length, sortkey, sklen); 1913 return FALSE; 1914 } 1915 strengthIndex ++; 1916 } 1917 caselevel ++; 1918 } 1919 return TRUE; 1920 } 1921 1922 static void TestSortKeyValidity(void) 1923 { 1924 /* testing UCA collation elements */ 1925 UErrorCode status = U_ZERO_ERROR; 1926 /* en_US has no tailorings */ 1927 UCollator *coll = ucol_open("en_US", &status); 1928 /* tailored locales */ 1929 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; 1930 FileStream *file = NULL; 1931 char line[2048]; 1932 UChar codepoints[10]; 1933 int count = 0; 1934 UChar contextCPs[5]; 1935 UParseError parseError; 1936 if (U_FAILURE(status)) { 1937 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1938 return; 1939 } 1940 log_verbose("Testing UCA elements\n"); 1941 file = getFractionalUCA(); 1942 if (file == NULL) { 1943 log_err("Fractional UCA data can not be opened\n"); 1944 return; 1945 } 1946 1947 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1948 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1949 line[0] == 0x000D || line[0] == '[') { 1950 continue; 1951 } 1952 1953 getCodePoints(line, codepoints, contextCPs); 1954 if(codepoints[0] == 0xFFFE) { 1955 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */ 1956 continue; 1957 } 1958 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); 1959 } 1960 1961 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1962 codepoints[0] = 0; 1963 1964 while (codepoints[0] < 0xFFFF) { 1965 if (u_isdefined((UChar32)codepoints[0])) { 1966 checkSortKeyValidity(coll, codepoints, 1); 1967 } 1968 codepoints[0] ++; 1969 } 1970 1971 ucol_close(coll); 1972 1973 /* testing tailored collation elements */ 1974 log_verbose("Testing tailored elements\n"); 1975 while (count < 5) { 1976 const UChar *rules = NULL, 1977 *current = NULL; 1978 UChar *rulesCopy = NULL; 1979 int32_t ruleLen = 0; 1980 1981 uint32_t chOffset = 0; 1982 uint32_t chLen = 0; 1983 uint32_t exOffset = 0; 1984 uint32_t exLen = 0; 1985 uint32_t prefixOffset = 0; 1986 uint32_t prefixLen = 0; 1987 UBool startOfRules = TRUE; 1988 UColOptionSet opts; 1989 1990 UColTokenParser src; 1991 uint32_t strength = 0; 1992 uint16_t specs = 0; 1993 status = U_ZERO_ERROR; // clear status from previous loop iteration 1994 1995 (void)specs; 1996 (void)strength; 1997 (void)prefixLen; 1998 (void)prefixOffset; 1999 (void)exLen; 2000 (void)exOffset; 2001 2002 uprv_memset(&src, 0, sizeof(UColTokenParser)); 2003 2004 coll = ucol_open(locale[count], &status); 2005 if (U_FAILURE(status)) { 2006 log_err("%s collator creation failed with status %s\n", locale[count], u_errorName(status)); 2007 return; 2008 } 2009 2010 src.opts = &opts; 2011 rules = ucol_getRules(coll, &ruleLen); 2012 2013 /* 2014 * We have not set up the UColTokenParser with a callback function 2015 * to fetch [import] sub-rules, 2016 * so skip testing tailorings that import others. 2017 * TODO: Ticket #8047: Change TestSortKeyValidity to use ucol_getTailoredSet() 2018 * rather than the internal collation rule parser 2019 */ 2020 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { 2021 rulesCopy = (UChar *)uprv_malloc((ruleLen + 2022 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 2023 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 2024 src.current = src.source = rulesCopy; 2025 src.end = rulesCopy + ruleLen; 2026 src.extraCurrent = src.end; 2027 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 2028 2029 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to 2030 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ 2031 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL && U_SUCCESS(status)) { 2032 strength = src.parsedToken.strength; 2033 chOffset = src.parsedToken.charsOffset; 2034 chLen = src.parsedToken.charsLen; 2035 exOffset = src.parsedToken.extensionOffset; 2036 exLen = src.parsedToken.extensionLen; 2037 prefixOffset = src.parsedToken.prefixOffset; 2038 prefixLen = src.parsedToken.prefixLen; 2039 specs = src.parsedToken.flags; 2040 2041 startOfRules = FALSE; 2042 uprv_memcpy(codepoints, src.source + chOffset, 2043 chLen * sizeof(UChar)); 2044 codepoints[chLen] = 0; 2045 if(codepoints[0] == 0xFFFE) { 2046 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */ 2047 continue; 2048 } 2049 checkSortKeyValidity(coll, codepoints, chLen); 2050 } 2051 if (U_FAILURE(status)) { 2052 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", locale[count], u_errorName(status)); 2053 } 2054 uprv_free(src.source); 2055 uprv_free(src.reorderCodes); 2056 } 2057 2058 ucol_close(coll); 2059 count ++; 2060 } 2061 T_FileStream_close(file); 2062 } 2063 2064 /** 2065 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with 2066 * normalization on AND jamo tailoring, among other things. 2067 */ 2068 static const UChar tsceText[] = { /* Nothing in here should be ignorable */ 2069 0x0020, 0xAC00, /* simple LV Hangul */ 2070 0x0020, 0xAC01, /* simple LVT Hangul */ 2071 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ 2072 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */ 2073 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ 2074 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ 2075 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */ 2076 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */ 2077 0x0020, 0x00E6, /* small letter ae, expands */ 2078 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */ 2079 0x0020 2080 }; 2081 enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; 2082 2083 static const int32_t rootStandardOffsets[] = { 2084 0, 1,2, 2085 2, 3,4,4, 2086 4, 5,6,6, 2087 6, 7,8,8, 2088 8, 9,10,11, 2089 12, 13,14,15, 2090 16, 17,18,19, 2091 20, 21,22,23, 2092 24, 25,26,26,26, 2093 26, 27,28,28, 2094 28, 2095 29 2096 }; 2097 enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) }; 2098 2099 static const int32_t rootSearchOffsets[] = { 2100 0, 1,2, 2101 2, 3,4,4, 2102 4, 5,6,6,6, 2103 6, 7,8,8,8,8,8,8, 2104 8, 9,10,11, 2105 12, 13,14,15, 2106 16, 17,18,19,20, 2107 20, 21,22,22,23,23,23,24, 2108 24, 25,26,26,26, 2109 26, 27,28,28, 2110 28, 2111 29 2112 }; 2113 enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) }; 2114 2115 typedef struct { 2116 const char * locale; 2117 const int32_t * offsets; 2118 int32_t offsetsLen; 2119 } TSCEItem; 2120 2121 static const TSCEItem tsceItems[] = { 2122 { "root", rootStandardOffsets, kLen_rootStandardOffsets }, 2123 { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets }, 2124 { NULL, NULL, 0 } 2125 }; 2126 2127 static void TestSearchCollatorElements(void) 2128 { 2129 const TSCEItem * tsceItemPtr; 2130 for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) { 2131 UErrorCode status = U_ZERO_ERROR; 2132 UCollator* ucol = ucol_open(tsceItemPtr->locale, &status); 2133 if ( U_SUCCESS(status) ) { 2134 UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status); 2135 if ( U_SUCCESS(status) ) { 2136 int32_t offset, element; 2137 const int32_t * nextOffsetPtr; 2138 const int32_t * limitOffsetPtr; 2139 2140 nextOffsetPtr = tsceItemPtr->offsets; 2141 limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; 2142 do { 2143 offset = ucol_getOffset(uce); 2144 element = ucol_next(uce, &status); 2145 if ( element == 0 ) { 2146 log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale ); 2147 } 2148 if ( nextOffsetPtr < limitOffsetPtr ) { 2149 if (offset != *nextOffsetPtr) { 2150 log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n", 2151 tsceItemPtr->locale, *nextOffsetPtr, offset ); 2152 nextOffsetPtr = limitOffsetPtr; 2153 break; 2154 } 2155 nextOffsetPtr++; 2156 } else { 2157 log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale ); 2158 } 2159 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); 2160 if ( nextOffsetPtr < limitOffsetPtr ) { 2161 log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale ); 2162 } 2163 2164 ucol_setOffset(uce, kLen_tsceText, &status); 2165 status = U_ZERO_ERROR; 2166 nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; 2167 limitOffsetPtr = tsceItemPtr->offsets; 2168 do { 2169 offset = ucol_getOffset(uce); 2170 element = ucol_previous(uce, &status); 2171 if ( element == 0 ) { 2172 log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale ); 2173 } 2174 if ( nextOffsetPtr > limitOffsetPtr ) { 2175 nextOffsetPtr--; 2176 if (offset != *nextOffsetPtr) { 2177 log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n", 2178 tsceItemPtr->locale, *nextOffsetPtr, offset ); 2179 nextOffsetPtr = limitOffsetPtr; 2180 break; 2181 } 2182 } else { 2183 log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale ); 2184 } 2185 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); 2186 if ( nextOffsetPtr > limitOffsetPtr ) { 2187 log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale ); 2188 } 2189 2190 ucol_closeElements(uce); 2191 } else { 2192 log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); 2193 } 2194 ucol_close(ucol); 2195 } else { 2196 log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); 2197 } 2198 } 2199 } 2200 2201 #endif /* #if !UCONFIG_NO_COLLATION */ 2202