1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File CITERTST.C 9 * 10 * Modification History: 11 * Date Name Description 12 * Madhu Katragadda Ported for C API 13 * 02/19/01 synwee Modified test case for new collation iterator 14 *********************************************************************************/ 15 /* 16 * Collation Iterator tests. 17 * (Let me reiterate my position...) 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucol.h" 25 #include "unicode/ucoleitr.h" 26 #include "unicode/uloc.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ustring.h" 29 #include "unicode/putil.h" 30 #include "callcoll.h" 31 #include "cmemory.h" 32 #include "cintltst.h" 33 #include "citertst.h" 34 #include "ccolltst.h" 35 #include "filestrm.h" 36 #include "cstring.h" 37 #include "ucol_imp.h" 38 #include "ucol_tok.h" 39 #include "uparse.h" 40 #include <stdio.h> 41 42 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); 43 44 void addCollIterTest(TestNode** root) 45 { 46 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); 47 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); 48 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); 49 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); 50 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); 51 addTest(root, &TestNormalizedUnicodeChar, 52 "tscoll/citertst/TestNormalizedUnicodeChar"); 53 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); 54 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); 55 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); 56 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); 57 addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); 58 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); 59 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"); 60 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); 61 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); 62 addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements"); 63 } 64 65 /* The locales we support */ 66 67 static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; 68 69 static void TestBug672() { 70 UErrorCode status = U_ZERO_ERROR; 71 UChar pattern[20]; 72 UChar text[50]; 73 int i; 74 int result[3][3]; 75 76 u_uastrcpy(pattern, "resume"); 77 u_uastrcpy(text, "Time to resume updating my resume."); 78 79 for (i = 0; i < 3; ++ i) { 80 UCollator *coll = ucol_open(LOCALES[i], &status); 81 UCollationElements *pitr = ucol_openElements(coll, pattern, -1, 82 &status); 83 UCollationElements *titer = ucol_openElements(coll, text, -1, 84 &status); 85 if (U_FAILURE(status)) { 86 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 87 myErrorName(status)); 88 return; 89 } 90 91 log_verbose("locale tested %s\n", LOCALES[i]); 92 93 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 94 U_SUCCESS(status)) { 95 } 96 if (U_FAILURE(status)) { 97 log_err("ERROR: reversing collation iterator :%s\n", 98 myErrorName(status)); 99 return; 100 } 101 ucol_reset(pitr); 102 103 ucol_setOffset(titer, u_strlen(pattern), &status); 104 if (U_FAILURE(status)) { 105 log_err("ERROR: setting offset in collator :%s\n", 106 myErrorName(status)); 107 return; 108 } 109 result[i][0] = ucol_getOffset(titer); 110 log_verbose("Text iterator set to offset %d\n", result[i][0]); 111 112 /* Use previous() */ 113 ucol_previous(titer, &status); 114 result[i][1] = ucol_getOffset(titer); 115 log_verbose("Current offset %d after previous\n", result[i][1]); 116 117 /* Add one to index */ 118 log_verbose("Adding one to current offset...\n"); 119 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 120 if (U_FAILURE(status)) { 121 log_err("ERROR: setting offset in collator :%s\n", 122 myErrorName(status)); 123 return; 124 } 125 result[i][2] = ucol_getOffset(titer); 126 log_verbose("Current offset in text = %d\n", result[i][2]); 127 ucol_closeElements(pitr); 128 ucol_closeElements(titer); 129 ucol_close(coll); 130 } 131 132 if (uprv_memcmp(result[0], result[1], 3) != 0 || 133 uprv_memcmp(result[1], result[2], 3) != 0) { 134 log_err("ERROR: Different locales have different offsets at the same character\n"); 135 } 136 } 137 138 139 140 /* Running this test with normalization enabled showed up a bug in the incremental 141 normalization code. */ 142 static void TestBug672Normalize() { 143 UErrorCode status = U_ZERO_ERROR; 144 UChar pattern[20]; 145 UChar text[50]; 146 int i; 147 int result[3][3]; 148 149 u_uastrcpy(pattern, "resume"); 150 u_uastrcpy(text, "Time to resume updating my resume."); 151 152 for (i = 0; i < 3; ++ i) { 153 UCollator *coll = ucol_open(LOCALES[i], &status); 154 UCollationElements *pitr = NULL; 155 UCollationElements *titer = NULL; 156 157 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 158 159 pitr = ucol_openElements(coll, pattern, -1, &status); 160 titer = ucol_openElements(coll, text, -1, &status); 161 if (U_FAILURE(status)) { 162 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 163 myErrorName(status)); 164 return; 165 } 166 167 log_verbose("locale tested %s\n", LOCALES[i]); 168 169 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 170 U_SUCCESS(status)) { 171 } 172 if (U_FAILURE(status)) { 173 log_err("ERROR: reversing collation iterator :%s\n", 174 myErrorName(status)); 175 return; 176 } 177 ucol_reset(pitr); 178 179 ucol_setOffset(titer, u_strlen(pattern), &status); 180 if (U_FAILURE(status)) { 181 log_err("ERROR: setting offset in collator :%s\n", 182 myErrorName(status)); 183 return; 184 } 185 result[i][0] = ucol_getOffset(titer); 186 log_verbose("Text iterator set to offset %d\n", result[i][0]); 187 188 /* Use previous() */ 189 ucol_previous(titer, &status); 190 result[i][1] = ucol_getOffset(titer); 191 log_verbose("Current offset %d after previous\n", result[i][1]); 192 193 /* Add one to index */ 194 log_verbose("Adding one to current offset...\n"); 195 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 196 if (U_FAILURE(status)) { 197 log_err("ERROR: setting offset in collator :%s\n", 198 myErrorName(status)); 199 return; 200 } 201 result[i][2] = ucol_getOffset(titer); 202 log_verbose("Current offset in text = %d\n", result[i][2]); 203 ucol_closeElements(pitr); 204 ucol_closeElements(titer); 205 ucol_close(coll); 206 } 207 208 if (uprv_memcmp(result[0], result[1], 3) != 0 || 209 uprv_memcmp(result[1], result[2], 3) != 0) { 210 log_err("ERROR: Different locales have different offsets at the same character\n"); 211 } 212 } 213 214 215 216 217 /** 218 * Test for CollationElementIterator previous and next for the whole set of 219 * unicode characters. 220 */ 221 static void TestUnicodeChar() 222 { 223 UChar source[0x100]; 224 UCollator *en_us; 225 UCollationElements *iter; 226 UErrorCode status = U_ZERO_ERROR; 227 UChar codepoint; 228 229 UChar *test; 230 en_us = ucol_open("en_US", &status); 231 if (U_FAILURE(status)){ 232 log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n", 233 myErrorName(status)); 234 return; 235 } 236 237 for (codepoint = 1; codepoint < 0xFFFE;) 238 { 239 test = source; 240 241 while (codepoint % 0xFF != 0) 242 { 243 if (u_isdefined(codepoint)) 244 *(test ++) = codepoint; 245 codepoint ++; 246 } 247 248 if (u_isdefined(codepoint)) 249 *(test ++) = codepoint; 250 251 if (codepoint != 0xFFFF) 252 codepoint ++; 253 254 *test = 0; 255 iter=ucol_openElements(en_us, source, u_strlen(source), &status); 256 if(U_FAILURE(status)){ 257 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 258 myErrorName(status)); 259 ucol_close(en_us); 260 return; 261 } 262 /* A basic test to see if it's working at all */ 263 log_verbose("codepoint testing %x\n", codepoint); 264 backAndForth(iter); 265 ucol_closeElements(iter); 266 267 /* null termination test */ 268 iter=ucol_openElements(en_us, source, -1, &status); 269 if(U_FAILURE(status)){ 270 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 271 myErrorName(status)); 272 ucol_close(en_us); 273 return; 274 } 275 /* A basic test to see if it's working at all */ 276 backAndForth(iter); 277 ucol_closeElements(iter); 278 } 279 280 ucol_close(en_us); 281 } 282 283 /** 284 * Test for CollationElementIterator previous and next for the whole set of 285 * unicode characters with normalization on. 286 */ 287 static void TestNormalizedUnicodeChar() 288 { 289 UChar source[0x100]; 290 UCollator *th_th; 291 UCollationElements *iter; 292 UErrorCode status = U_ZERO_ERROR; 293 UChar codepoint; 294 295 UChar *test; 296 /* thai should have normalization on */ 297 th_th = ucol_open("th_TH", &status); 298 if (U_FAILURE(status)){ 299 log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n", 300 myErrorName(status)); 301 return; 302 } 303 304 for (codepoint = 1; codepoint < 0xFFFE;) 305 { 306 test = source; 307 308 while (codepoint % 0xFF != 0) 309 { 310 if (u_isdefined(codepoint)) 311 *(test ++) = codepoint; 312 codepoint ++; 313 } 314 315 if (u_isdefined(codepoint)) 316 *(test ++) = codepoint; 317 318 if (codepoint != 0xFFFF) 319 codepoint ++; 320 321 *test = 0; 322 iter=ucol_openElements(th_th, source, u_strlen(source), &status); 323 if(U_FAILURE(status)){ 324 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 325 myErrorName(status)); 326 ucol_close(th_th); 327 return; 328 } 329 330 backAndForth(iter); 331 ucol_closeElements(iter); 332 333 iter=ucol_openElements(th_th, source, -1, &status); 334 if(U_FAILURE(status)){ 335 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 336 myErrorName(status)); 337 ucol_close(th_th); 338 return; 339 } 340 341 backAndForth(iter); 342 ucol_closeElements(iter); 343 } 344 345 ucol_close(th_th); 346 } 347 348 /** 349 * Test the incremental normalization 350 */ 351 static void TestNormalization() 352 { 353 UErrorCode status = U_ZERO_ERROR; 354 const char *str = 355 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315"; 356 UCollator *coll; 357 UChar rule[50]; 358 int rulelen = u_unescape(str, rule, 50); 359 int count = 0; 360 const char *testdata[] = 361 {"\\u1ED9", "o\\u0323\\u0302", 362 "\\u0300\\u0315", "\\u0315\\u0300", 363 "A\\u0300\\u0315B", "A\\u0315\\u0300B", 364 "A\\u0316\\u0315B", "A\\u0315\\u0316B", 365 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316", 366 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B", 367 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"}; 368 int32_t srclen; 369 UChar source[10]; 370 UCollationElements *iter; 371 372 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status); 373 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 374 if (U_FAILURE(status)){ 375 log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n", 376 myErrorName(status)); 377 return; 378 } 379 380 srclen = u_unescape(testdata[0], source, 10); 381 iter = ucol_openElements(coll, source, srclen, &status); 382 backAndForth(iter); 383 ucol_closeElements(iter); 384 385 srclen = u_unescape(testdata[1], source, 10); 386 iter = ucol_openElements(coll, source, srclen, &status); 387 backAndForth(iter); 388 ucol_closeElements(iter); 389 390 while (count < 12) { 391 srclen = u_unescape(testdata[count], source, 10); 392 iter = ucol_openElements(coll, source, srclen, &status); 393 394 if (U_FAILURE(status)){ 395 log_err("ERROR: in creation of collator element iterator\n %s\n", 396 myErrorName(status)); 397 return; 398 } 399 backAndForth(iter); 400 ucol_closeElements(iter); 401 402 iter = ucol_openElements(coll, source, -1, &status); 403 404 if (U_FAILURE(status)){ 405 log_err("ERROR: in creation of collator element iterator\n %s\n", 406 myErrorName(status)); 407 return; 408 } 409 backAndForth(iter); 410 ucol_closeElements(iter); 411 count ++; 412 } 413 ucol_close(coll); 414 } 415 416 /** 417 * Test for CollationElementIterator.previous() 418 * 419 * @bug 4108758 - Make sure it works with contracting characters 420 * 421 */ 422 static void TestPrevious() 423 { 424 UCollator *coll=NULL; 425 UChar rule[50]; 426 UChar *source; 427 UCollator *c1, *c2, *c3; 428 UCollationElements *iter; 429 UErrorCode status = U_ZERO_ERROR; 430 UChar test1[50]; 431 UChar test2[50]; 432 433 u_uastrcpy(test1, "What subset of all possible test cases?"); 434 u_uastrcpy(test2, "has the highest probability of detecting"); 435 coll = ucol_open("en_US", &status); 436 437 iter=ucol_openElements(coll, test1, u_strlen(test1), &status); 438 log_verbose("English locale testing back and forth\n"); 439 if(U_FAILURE(status)){ 440 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 441 myErrorName(status)); 442 ucol_close(coll); 443 return; 444 } 445 /* A basic test to see if it's working at all */ 446 backAndForth(iter); 447 ucol_closeElements(iter); 448 ucol_close(coll); 449 450 /* Test with a contracting character sequence */ 451 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH"); 452 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 453 454 log_verbose("Contraction rule testing back and forth with no normalization\n"); 455 456 if (c1 == NULL || U_FAILURE(status)) 457 { 458 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n", 459 myErrorName(status)); 460 return; 461 } 462 source=(UChar*)malloc(sizeof(UChar) * 20); 463 u_uastrcpy(source, "abchdcba"); 464 iter=ucol_openElements(c1, source, u_strlen(source), &status); 465 if(U_FAILURE(status)){ 466 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 467 myErrorName(status)); 468 return; 469 } 470 backAndForth(iter); 471 ucol_closeElements(iter); 472 ucol_close(c1); 473 474 /* Test with an expanding character sequence */ 475 u_uastrcpy(rule, "&a < b < c/abd < d"); 476 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 477 log_verbose("Expansion rule testing back and forth with no normalization\n"); 478 if (c2 == NULL || U_FAILURE(status)) 479 { 480 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 481 myErrorName(status)); 482 return; 483 } 484 u_uastrcpy(source, "abcd"); 485 iter=ucol_openElements(c2, source, u_strlen(source), &status); 486 if(U_FAILURE(status)){ 487 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 488 myErrorName(status)); 489 return; 490 } 491 backAndForth(iter); 492 ucol_closeElements(iter); 493 ucol_close(c2); 494 /* Now try both */ 495 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch"); 496 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status); 497 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n"); 498 499 if (c3 == NULL || U_FAILURE(status)) 500 { 501 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 502 myErrorName(status)); 503 return; 504 } 505 u_uastrcpy(source, "abcdbchdc"); 506 iter=ucol_openElements(c3, source, u_strlen(source), &status); 507 if(U_FAILURE(status)){ 508 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 509 myErrorName(status)); 510 return; 511 } 512 backAndForth(iter); 513 ucol_closeElements(iter); 514 ucol_close(c3); 515 source[0] = 0x0e41; 516 source[1] = 0x0e02; 517 source[2] = 0x0e41; 518 source[3] = 0x0e02; 519 source[4] = 0x0e27; 520 source[5] = 0x61; 521 source[6] = 0x62; 522 source[7] = 0x63; 523 source[8] = 0; 524 525 coll = ucol_open("th_TH", &status); 526 log_verbose("Thai locale testing back and forth with normalization\n"); 527 iter=ucol_openElements(coll, source, u_strlen(source), &status); 528 if(U_FAILURE(status)){ 529 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 530 myErrorName(status)); 531 return; 532 } 533 backAndForth(iter); 534 ucol_closeElements(iter); 535 ucol_close(coll); 536 537 /* prev test */ 538 source[0] = 0x0061; 539 source[1] = 0x30CF; 540 source[2] = 0x3099; 541 source[3] = 0x30FC; 542 source[4] = 0; 543 544 coll = ucol_open("ja_JP", &status); 545 log_verbose("Japanese locale testing back and forth with normalization\n"); 546 iter=ucol_openElements(coll, source, u_strlen(source), &status); 547 if(U_FAILURE(status)){ 548 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 549 myErrorName(status)); 550 return; 551 } 552 backAndForth(iter); 553 ucol_closeElements(iter); 554 ucol_close(coll); 555 556 free(source); 557 } 558 559 /** 560 * Test for getOffset() and setOffset() 561 */ 562 static void TestOffset() 563 { 564 UErrorCode status= U_ZERO_ERROR; 565 UCollator *en_us=NULL; 566 UCollationElements *iter, *pristine; 567 int32_t offset; 568 OrderAndOffset *orders; 569 int32_t orderLength=0; 570 int count = 0; 571 UChar test1[50]; 572 UChar test2[50]; 573 574 u_uastrcpy(test1, "What subset of all possible test cases?"); 575 u_uastrcpy(test2, "has the highest probability of detecting"); 576 en_us = ucol_open("en_US", &status); 577 log_verbose("Testing getOffset and setOffset for collations\n"); 578 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status); 579 if(U_FAILURE(status)){ 580 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 581 myErrorName(status)); 582 ucol_close(en_us); 583 return; 584 } 585 586 /* testing boundaries */ 587 ucol_setOffset(iter, 0, &status); 588 if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) { 589 log_err("Error: After setting offset to 0, we should be at the end " 590 "of the backwards iteration"); 591 } 592 ucol_setOffset(iter, u_strlen(test1), &status); 593 if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) { 594 log_err("Error: After setting offset to end of the string, we should " 595 "be at the end of the backwards iteration"); 596 } 597 598 /* Run all the way through the iterator, then get the offset */ 599 600 orders = getOrders(iter, &orderLength); 601 602 offset = ucol_getOffset(iter); 603 604 if (offset != u_strlen(test1)) 605 { 606 log_err("offset at end != length %d vs %d\n", offset, 607 u_strlen(test1) ); 608 } 609 610 /* Now set the offset back to the beginning and see if it works */ 611 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status); 612 if(U_FAILURE(status)){ 613 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 614 myErrorName(status)); 615 ucol_close(en_us); 616 return; 617 } 618 status = U_ZERO_ERROR; 619 620 ucol_setOffset(iter, 0, &status); 621 if (U_FAILURE(status)) 622 { 623 log_err("setOffset failed. %s\n", myErrorName(status)); 624 } 625 else 626 { 627 assertEqual(iter, pristine); 628 } 629 630 ucol_closeElements(pristine); 631 ucol_closeElements(iter); 632 free(orders); 633 634 /* testing offsets in normalization buffer */ 635 test1[0] = 0x61; 636 test1[1] = 0x300; 637 test1[2] = 0x316; 638 test1[3] = 0x62; 639 test1[4] = 0; 640 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 641 iter = ucol_openElements(en_us, test1, 4, &status); 642 if(U_FAILURE(status)){ 643 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 644 myErrorName(status)); 645 ucol_close(en_us); 646 return; 647 } 648 649 count = 0; 650 while (ucol_next(iter, &status) != UCOL_NULLORDER && 651 U_SUCCESS(status)) { 652 switch (count) { 653 case 0: 654 if (ucol_getOffset(iter) != 1) { 655 log_err("ERROR: Offset of iteration should be 1\n"); 656 } 657 break; 658 case 3: 659 if (ucol_getOffset(iter) != 4) { 660 log_err("ERROR: Offset of iteration should be 4\n"); 661 } 662 break; 663 default: 664 if (ucol_getOffset(iter) != 3) { 665 log_err("ERROR: Offset of iteration should be 3\n"); 666 } 667 } 668 count ++; 669 } 670 671 ucol_reset(iter); 672 count = 0; 673 while (ucol_previous(iter, &status) != UCOL_NULLORDER && 674 U_SUCCESS(status)) { 675 switch (count) { 676 case 0: 677 case 1: 678 if (ucol_getOffset(iter) != 3) { 679 log_err("ERROR: Offset of iteration should be 3\n"); 680 } 681 break; 682 case 2: 683 if (ucol_getOffset(iter) != 1) { 684 log_err("ERROR: Offset of iteration should be 1\n"); 685 } 686 break; 687 default: 688 if (ucol_getOffset(iter) != 0) { 689 log_err("ERROR: Offset of iteration should be 0\n"); 690 } 691 } 692 count ++; 693 } 694 695 if(U_FAILURE(status)){ 696 log_err("ERROR: in iterating collation elements %s\n", 697 myErrorName(status)); 698 } 699 700 ucol_closeElements(iter); 701 ucol_close(en_us); 702 } 703 704 /** 705 * Test for setText() 706 */ 707 static void TestSetText() 708 { 709 int32_t c,i; 710 UErrorCode status = U_ZERO_ERROR; 711 UCollator *en_us=NULL; 712 UCollationElements *iter1, *iter2; 713 UChar test1[50]; 714 UChar test2[50]; 715 716 u_uastrcpy(test1, "What subset of all possible test cases?"); 717 u_uastrcpy(test2, "has the highest probability of detecting"); 718 en_us = ucol_open("en_US", &status); 719 log_verbose("testing setText for Collation elements\n"); 720 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status); 721 if(U_FAILURE(status)){ 722 log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", 723 myErrorName(status)); 724 ucol_close(en_us); 725 return; 726 } 727 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status); 728 if(U_FAILURE(status)){ 729 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n", 730 myErrorName(status)); 731 ucol_close(en_us); 732 return; 733 } 734 735 /* Run through the second iterator just to exercise it */ 736 c = ucol_next(iter2, &status); 737 i = 0; 738 739 while ( ++i < 10 && (c != UCOL_NULLORDER)) 740 { 741 if (U_FAILURE(status)) 742 { 743 log_err("iter2->next() returned an error. %s\n", myErrorName(status)); 744 ucol_closeElements(iter2); 745 ucol_closeElements(iter1); 746 ucol_close(en_us); 747 return; 748 } 749 750 c = ucol_next(iter2, &status); 751 } 752 753 /* Now set it to point to the same string as the first iterator */ 754 ucol_setText(iter2, test1, u_strlen(test1), &status); 755 if (U_FAILURE(status)) 756 { 757 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status)); 758 } 759 else 760 { 761 assertEqual(iter1, iter2); 762 } 763 764 /* Now set it to point to a null string with fake length*/ 765 ucol_setText(iter2, NULL, 2, &status); 766 if (U_FAILURE(status)) 767 { 768 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)); 769 } 770 else 771 { 772 if (ucol_next(iter2, &status) != UCOL_NULLORDER) { 773 log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); 774 } 775 } 776 777 ucol_closeElements(iter2); 778 ucol_closeElements(iter1); 779 ucol_close(en_us); 780 } 781 782 /** @bug 4108762 783 * Test for getMaxExpansion() 784 */ 785 static void TestMaxExpansion() 786 { 787 UErrorCode status = U_ZERO_ERROR; 788 UCollator *coll ;/*= ucol_open("en_US", &status);*/ 789 UChar ch = 0; 790 UChar32 unassigned = 0xEFFFD; 791 UChar supplementary[2]; 792 uint32_t stringOffset = 0; 793 UBool isError = FALSE; 794 uint32_t sorder = 0; 795 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/ 796 uint32_t temporder = 0; 797 798 UChar rule[256]; 799 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch"); 800 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 801 UCOL_DEFAULT_STRENGTH,NULL, &status); 802 if(U_SUCCESS(status) && coll) { 803 iter = ucol_openElements(coll, &ch, 1, &status); 804 805 while (ch < 0xFFFF && U_SUCCESS(status)) { 806 int count = 1; 807 uint32_t order; 808 int32_t size = 0; 809 810 ch ++; 811 812 ucol_setText(iter, &ch, 1, &status); 813 order = ucol_previous(iter, &status); 814 815 /* thai management */ 816 if (order == 0) 817 order = ucol_previous(iter, &status); 818 819 while (U_SUCCESS(status) && 820 ucol_previous(iter, &status) != UCOL_NULLORDER) { 821 count ++; 822 } 823 824 size = ucol_getMaxExpansion(iter, order); 825 if (U_FAILURE(status) || size < count) { 826 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 827 ch, count); 828 } 829 } 830 831 /* testing for exact max expansion */ 832 ch = 0; 833 while (ch < 0x61) { 834 uint32_t order; 835 int32_t size; 836 ucol_setText(iter, &ch, 1, &status); 837 order = ucol_previous(iter, &status); 838 size = ucol_getMaxExpansion(iter, order); 839 if (U_FAILURE(status) || size != 1) { 840 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 841 ch, 1); 842 } 843 ch ++; 844 } 845 846 ch = 0x63; 847 ucol_setText(iter, &ch, 1, &status); 848 temporder = ucol_previous(iter, &status); 849 850 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) { 851 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 852 ch, 3); 853 } 854 855 ch = 0x64; 856 ucol_setText(iter, &ch, 1, &status); 857 temporder = ucol_previous(iter, &status); 858 859 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) { 860 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 861 ch, 3); 862 } 863 864 U16_APPEND(supplementary, stringOffset, 2, unassigned, isError); 865 ucol_setText(iter, supplementary, 2, &status); 866 sorder = ucol_previous(iter, &status); 867 868 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) { 869 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 870 ch, 2); 871 } 872 873 /* testing jamo */ 874 ch = 0x1165; 875 876 ucol_setText(iter, &ch, 1, &status); 877 temporder = ucol_previous(iter, &status); 878 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) { 879 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 880 ch, 3); 881 } 882 883 ucol_closeElements(iter); 884 ucol_close(coll); 885 886 /* testing special jamo &a<\u1160 */ 887 rule[0] = 0x26; 888 rule[1] = 0x71; 889 rule[2] = 0x3c; 890 rule[3] = 0x1165; 891 rule[4] = 0x2f; 892 rule[5] = 0x71; 893 rule[6] = 0x71; 894 rule[7] = 0x71; 895 rule[8] = 0x71; 896 rule[9] = 0; 897 898 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 899 UCOL_DEFAULT_STRENGTH,NULL, &status); 900 iter = ucol_openElements(coll, &ch, 1, &status); 901 902 temporder = ucol_previous(iter, &status); 903 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) { 904 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 905 ch, 5); 906 } 907 908 ucol_closeElements(iter); 909 ucol_close(coll); 910 } else { 911 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 912 } 913 914 } 915 916 917 static void assertEqual(UCollationElements *i1, UCollationElements *i2) 918 { 919 int32_t c1, c2; 920 int32_t count = 0; 921 UErrorCode status = U_ZERO_ERROR; 922 923 do 924 { 925 c1 = ucol_next(i1, &status); 926 c2 = ucol_next(i2, &status); 927 928 if (c1 != c2) 929 { 930 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2); 931 break; 932 } 933 934 count += 1; 935 } 936 while (c1 != UCOL_NULLORDER); 937 } 938 939 /** 940 * Testing iterators with extremely small buffers 941 */ 942 static void TestSmallBuffer() 943 { 944 UErrorCode status = U_ZERO_ERROR; 945 UCollator *coll; 946 UCollationElements *testiter, 947 *iter; 948 int32_t count = 0; 949 OrderAndOffset *testorders, 950 *orders; 951 952 UChar teststr[500]; 953 UChar str[] = {0x300, 0x31A, 0}; 954 /* 955 creating a long string of decomposable characters, 956 since by default the writable buffer is of size 256 957 */ 958 while (count < 500) { 959 if ((count & 1) == 0) { 960 teststr[count ++] = 0x300; 961 } 962 else { 963 teststr[count ++] = 0x31A; 964 } 965 } 966 967 coll = ucol_open("th_TH", &status); 968 if(U_SUCCESS(status) && coll) { 969 testiter = ucol_openElements(coll, teststr, 500, &status); 970 iter = ucol_openElements(coll, str, 2, &status); 971 972 orders = getOrders(iter, &count); 973 if (count != 2) { 974 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n"); 975 } 976 977 /* 978 this will rearrange the string data to 250 characters of 0x300 first then 979 250 characters of 0x031A 980 */ 981 testorders = getOrders(testiter, &count); 982 983 if (count != 500) { 984 log_err("Error decomposition does not give the right sized collation elements\n"); 985 } 986 987 while (count != 0) { 988 /* UCA collation element for 0x0F76 */ 989 if ((count > 250 && testorders[-- count].order != orders[1].order) || 990 (count <= 250 && testorders[-- count].order != orders[0].order)) { 991 log_err("Error decomposition does not give the right collation element at %d count\n", count); 992 break; 993 } 994 } 995 996 free(testorders); 997 free(orders); 998 999 ucol_reset(testiter); 1000 1001 /* ensures closing of elements done properly to clear writable buffer */ 1002 ucol_next(testiter, &status); 1003 ucol_next(testiter, &status); 1004 ucol_closeElements(testiter); 1005 ucol_closeElements(iter); 1006 ucol_close(coll); 1007 } else { 1008 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 1009 } 1010 } 1011 1012 /** 1013 * Sniplets of code from genuca 1014 */ 1015 static int32_t hex2num(char hex) { 1016 if(hex>='0' && hex <='9') { 1017 return hex-'0'; 1018 } else if(hex>='a' && hex<='f') { 1019 return hex-'a'+10; 1020 } else if(hex>='A' && hex<='F') { 1021 return hex-'A'+10; 1022 } else { 1023 return 0; 1024 } 1025 } 1026 1027 /** 1028 * Getting codepoints from a string 1029 * @param str character string contain codepoints seperated by space and ended 1030 * by a semicolon 1031 * @param codepoints array for storage, assuming size > 5 1032 * @return position at the end of the codepoint section 1033 */ 1034 static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { 1035 UErrorCode errorCode = U_ZERO_ERROR; 1036 char *semi = uprv_strchr(str, ';'); 1037 char *pipe = uprv_strchr(str, '|'); 1038 char *s; 1039 *codepoints = 0; 1040 *contextCPs = 0; 1041 if(semi == NULL) { 1042 log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str); 1043 return str; 1044 } 1045 if(pipe != NULL) { 1046 int32_t contextLength; 1047 *pipe = 0; 1048 contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode); 1049 *pipe = '|'; 1050 if(U_FAILURE(errorCode)) { 1051 log_err("error parsing precontext string from FractionalUCA.txt %s\n", str); 1052 return str; 1053 } 1054 /* prepend the precontext string to the codepoints */ 1055 u_memcpy(codepoints, contextCPs, contextLength); 1056 codepoints += contextLength; 1057 /* start of the code point string */ 1058 s = pipe + 1; 1059 } else { 1060 s = str; 1061 } 1062 u_parseString(s, codepoints, 99, NULL, &errorCode); 1063 if(U_FAILURE(errorCode)) { 1064 log_err("error parsing code point string from FractionalUCA.txt %s\n", str); 1065 return str; 1066 } 1067 return semi + 1; 1068 } 1069 1070 /** 1071 * Sniplets of code from genuca 1072 */ 1073 static int32_t 1074 readElement(char **from, char *to, char separator, UErrorCode *status) 1075 { 1076 if (U_SUCCESS(*status)) { 1077 char buffer[1024]; 1078 int32_t i = 0; 1079 while (**from != separator) { 1080 if (**from != ' ') { 1081 *(buffer+i++) = **from; 1082 } 1083 (*from)++; 1084 } 1085 (*from)++; 1086 *(buffer + i) = 0; 1087 strcpy(to, buffer); 1088 return i/2; 1089 } 1090 1091 return 0; 1092 } 1093 1094 /** 1095 * Sniplets of code from genuca 1096 */ 1097 static uint32_t 1098 getSingleCEValue(char *primary, char *secondary, char *tertiary, 1099 UErrorCode *status) 1100 { 1101 if (U_SUCCESS(*status)) { 1102 uint32_t value = 0; 1103 char primsave = '\0'; 1104 char secsave = '\0'; 1105 char tersave = '\0'; 1106 char *primend = primary+4; 1107 char *secend = secondary+2; 1108 char *terend = tertiary+2; 1109 uint32_t primvalue; 1110 uint32_t secvalue; 1111 uint32_t tervalue; 1112 1113 if (uprv_strlen(primary) > 4) { 1114 primsave = *primend; 1115 *primend = '\0'; 1116 } 1117 1118 if (uprv_strlen(secondary) > 2) { 1119 secsave = *secend; 1120 *secend = '\0'; 1121 } 1122 1123 if (uprv_strlen(tertiary) > 2) { 1124 tersave = *terend; 1125 *terend = '\0'; 1126 } 1127 1128 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; 1129 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; 1130 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; 1131 if(primvalue <= 0xFF) { 1132 primvalue <<= 8; 1133 } 1134 1135 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) 1136 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) 1137 | (tervalue & UCOL_TERTIARYORDERMASK); 1138 1139 if(primsave!='\0') { 1140 *primend = primsave; 1141 } 1142 if(secsave!='\0') { 1143 *secend = secsave; 1144 } 1145 if(tersave!='\0') { 1146 *terend = tersave; 1147 } 1148 return value; 1149 } 1150 return 0; 1151 } 1152 1153 /** 1154 * Getting collation elements generated from a string 1155 * @param str character string contain collation elements contained in [] and 1156 * seperated by space 1157 * @param ce array for storage, assuming size > 20 1158 * @param status error status 1159 * @return position at the end of the codepoint section 1160 */ 1161 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { 1162 char *pStartCP = uprv_strchr(str, '['); 1163 int count = 0; 1164 char *pEndCP; 1165 char primary[100]; 1166 char secondary[100]; 1167 char tertiary[100]; 1168 1169 while (*pStartCP == '[') { 1170 uint32_t primarycount = 0; 1171 uint32_t secondarycount = 0; 1172 uint32_t tertiarycount = 0; 1173 uint32_t CEi = 1; 1174 pEndCP = strchr(pStartCP, ']'); 1175 if(pEndCP == NULL) { 1176 break; 1177 } 1178 pStartCP ++; 1179 1180 primarycount = readElement(&pStartCP, primary, ',', status); 1181 secondarycount = readElement(&pStartCP, secondary, ',', status); 1182 tertiarycount = readElement(&pStartCP, tertiary, ']', status); 1183 1184 /* I want to get the CEs entered right here, including continuation */ 1185 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); 1186 if (U_FAILURE(*status)) { 1187 break; 1188 } 1189 1190 while (2 * CEi < primarycount || CEi < secondarycount || 1191 CEi < tertiarycount) { 1192 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 1193 if (2 * CEi < primarycount) { 1194 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); 1195 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); 1196 } 1197 1198 if (2 * CEi + 1 < primarycount) { 1199 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); 1200 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); 1201 } 1202 1203 if (CEi < secondarycount) { 1204 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); 1205 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); 1206 } 1207 1208 if (CEi < tertiarycount) { 1209 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); 1210 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); 1211 } 1212 1213 CEi ++; 1214 ces[count ++] = value; 1215 } 1216 1217 pStartCP = pEndCP + 1; 1218 } 1219 ces[count] = 0; 1220 return pStartCP; 1221 } 1222 1223 /** 1224 * Getting the FractionalUCA.txt file stream 1225 */ 1226 static FileStream * getFractionalUCA(void) 1227 { 1228 char newPath[256]; 1229 char backupPath[256]; 1230 FileStream *result = NULL; 1231 1232 /* Look inside ICU_DATA first */ 1233 uprv_strcpy(newPath, ctest_dataSrcDir()); 1234 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); 1235 uprv_strcat(newPath, "FractionalUCA.txt"); 1236 1237 /* As a fallback, try to guess where the source data was located 1238 * at the time ICU was built, and look there. 1239 */ 1240 #if defined (U_TOPSRCDIR) 1241 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); 1242 #else 1243 { 1244 UErrorCode errorCode = U_ZERO_ERROR; 1245 strcpy(backupPath, loadTestData(&errorCode)); 1246 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); 1247 } 1248 #endif 1249 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt"); 1250 1251 result = T_FileStream_open(newPath, "rb"); 1252 1253 if (result == NULL) { 1254 result = T_FileStream_open(backupPath, "rb"); 1255 if (result == NULL) { 1256 log_err("Failed to open either %s or %s\n", newPath, backupPath); 1257 } 1258 } 1259 return result; 1260 } 1261 1262 /** 1263 * Testing the CEs returned by the iterator 1264 */ 1265 static void TestCEs() { 1266 FileStream *file = NULL; 1267 char line[2048]; 1268 char *str; 1269 UChar codepoints[10]; 1270 uint32_t ces[20]; 1271 UErrorCode status = U_ZERO_ERROR; 1272 UCollator *coll = ucol_open("", &status); 1273 uint32_t lineNo = 0; 1274 UChar contextCPs[5]; 1275 1276 if (U_FAILURE(status)) { 1277 log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status)); 1278 return; 1279 } 1280 1281 file = getFractionalUCA(); 1282 1283 if (file == NULL) { 1284 log_err("*** unable to open input FractionalUCA.txt file ***\n"); 1285 return; 1286 } 1287 1288 1289 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1290 int count = 0; 1291 UCollationElements *iter; 1292 int32_t preContextCeLen=0; 1293 lineNo++; 1294 /* skip this line if it is empty or a comment or is a return value 1295 or start of some variable section */ 1296 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1297 line[0] == 0x000D || line[0] == '[') { 1298 continue; 1299 } 1300 1301 str = getCodePoints(line, codepoints, contextCPs); 1302 1303 /* these are 'fake' codepoints in the fractional UCA, and are used just 1304 * for positioning of indirect values. They should not go through this 1305 * test. 1306 */ 1307 if(*codepoints == 0xFDD0) { 1308 continue; 1309 } 1310 if (*contextCPs != 0) { 1311 iter = ucol_openElements(coll, contextCPs, -1, &status); 1312 if (U_FAILURE(status)) { 1313 log_err("Error in opening collation elements\n"); 1314 break; 1315 } 1316 while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) { 1317 preContextCeLen++; 1318 } 1319 ucol_closeElements(iter); 1320 } 1321 1322 getCEs(str, ces+preContextCeLen, &status); 1323 if (U_FAILURE(status)) { 1324 log_err("Error in parsing collation elements in FractionalUCA.txt\n"); 1325 break; 1326 } 1327 iter = ucol_openElements(coll, codepoints, -1, &status); 1328 if (U_FAILURE(status)) { 1329 log_err("Error in opening collation elements\n"); 1330 break; 1331 } 1332 for (;;) { 1333 uint32_t ce = (uint32_t)ucol_next(iter, &status); 1334 if (ce == 0xFFFFFFFF) { 1335 ce = 0; 1336 } 1337 /* we now unconditionally reorder Thai/Lao prevowels, so this 1338 * test would fail if we don't skip here. 1339 */ 1340 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { 1341 continue; 1342 } 1343 if (ce != ces[count] || U_FAILURE(status)) { 1344 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n"); 1345 break; 1346 } 1347 if (ces[count] == 0) { 1348 break; 1349 } 1350 count ++; 1351 } 1352 ucol_closeElements(iter); 1353 } 1354 1355 T_FileStream_close(file); 1356 ucol_close(coll); 1357 } 1358 1359 /** 1360 * Testing the discontigous contractions 1361 */ 1362 static void TestDiscontiguos() { 1363 const char *rulestr = 1364 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; 1365 UChar rule[50]; 1366 int rulelen = u_unescape(rulestr, rule, 50); 1367 const char *src[] = { 1368 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", 1369 /* base character blocked */ 1370 "XD\\u0300", "XD\\u0300\\u0315", 1371 /* non blocking combining character */ 1372 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315", 1373 /* blocking combining character */ 1374 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315", 1375 /* contraction prefix */ 1376 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315", 1377 "X\\u0300\\u031A\\u0315", 1378 /* ends not with a contraction character */ 1379 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D", 1380 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D" 1381 }; 1382 const char *tgt[] = { 1383 /* non blocking combining character */ 1384 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC", 1385 /* base character blocked */ 1386 "X D \\u0300", "X D \\u0300\\u0315", 1387 /* non blocking combining character */ 1388 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319", 1389 /* blocking combining character */ 1390 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315", 1391 /* contraction prefix */ 1392 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319", 1393 "X\\u0300 \\u031A \\u0315", 1394 /* ends not with a contraction character */ 1395 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D", 1396 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D" 1397 }; 1398 int size = 20; 1399 UCollator *coll; 1400 UErrorCode status = U_ZERO_ERROR; 1401 int count = 0; 1402 UCollationElements *iter; 1403 UCollationElements *resultiter; 1404 1405 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status); 1406 iter = ucol_openElements(coll, rule, 1, &status); 1407 resultiter = ucol_openElements(coll, rule, 1, &status); 1408 1409 if (U_FAILURE(status)) { 1410 log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status)); 1411 return; 1412 } 1413 1414 while (count < size) { 1415 UChar str[20]; 1416 UChar tstr[20]; 1417 int strLen = u_unescape(src[count], str, 20); 1418 UChar *s; 1419 1420 ucol_setText(iter, str, strLen, &status); 1421 if (U_FAILURE(status)) { 1422 log_err("Error opening collation iterator\n"); 1423 return; 1424 } 1425 1426 u_unescape(tgt[count], tstr, 20); 1427 s = tstr; 1428 1429 log_verbose("count %d\n", count); 1430 1431 for (;;) { 1432 uint32_t ce; 1433 UChar *e = u_strchr(s, 0x20); 1434 if (e == 0) { 1435 e = u_strchr(s, 0); 1436 } 1437 ucol_setText(resultiter, s, (int32_t)(e - s), &status); 1438 ce = ucol_next(resultiter, &status); 1439 if (U_FAILURE(status)) { 1440 log_err("Error manipulating collation iterator\n"); 1441 return; 1442 } 1443 while (ce != UCOL_NULLORDER) { 1444 if (ce != (uint32_t)ucol_next(iter, &status) || 1445 U_FAILURE(status)) { 1446 log_err("Discontiguos contraction test mismatch\n"); 1447 return; 1448 } 1449 ce = ucol_next(resultiter, &status); 1450 if (U_FAILURE(status)) { 1451 log_err("Error getting next collation element\n"); 1452 return; 1453 } 1454 } 1455 s = e + 1; 1456 if (*e == 0) { 1457 break; 1458 } 1459 } 1460 ucol_reset(iter); 1461 backAndForth(iter); 1462 count ++; 1463 } 1464 ucol_closeElements(resultiter); 1465 ucol_closeElements(iter); 1466 ucol_close(coll); 1467 } 1468 1469 static void TestCEBufferOverflow() 1470 { 1471 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; 1472 UErrorCode status = U_ZERO_ERROR; 1473 UChar rule[10]; 1474 UCollator *coll; 1475 UCollationElements *iter; 1476 1477 u_uastrcpy(rule, "&z < AB"); 1478 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status); 1479 if (U_FAILURE(status)) { 1480 log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status)); 1481 return; 1482 } 1483 1484 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic 1485 test. this will cause an overflow in getPrev */ 1486 str[0] = 0x0041; /* 'A' */ 1487 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ 1488 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); 1489 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ 1490 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, 1491 &status); 1492 if (ucol_previous(iter, &status) == UCOL_NULLORDER || 1493 status == U_BUFFER_OVERFLOW_ERROR) { 1494 log_err("CE buffer should not overflow with long string of trail surrogates\n"); 1495 } 1496 ucol_closeElements(iter); 1497 ucol_close(coll); 1498 } 1499 1500 /** 1501 * Checking collation element validity. 1502 */ 1503 #define MAX_CODEPOINTS_TO_SHOW 10 1504 static void showCodepoints(const UChar *codepoints, int length, char * codepointText) { 1505 int i, lengthToUse = length; 1506 if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) { 1507 lengthToUse = MAX_CODEPOINTS_TO_SHOW; 1508 } 1509 for (i = 0; i < lengthToUse; ++i) { 1510 int bytesWritten = sprintf(codepointText, " %04X", *codepoints++); 1511 if (bytesWritten <= 0) { 1512 break; 1513 } 1514 codepointText += bytesWritten; 1515 } 1516 if (i < length) { 1517 sprintf(codepointText, " ..."); 1518 } 1519 } 1520 1521 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, 1522 int length) 1523 { 1524 UErrorCode status = U_ZERO_ERROR; 1525 UCollationElements *iter = ucol_openElements(coll, codepoints, length, 1526 &status); 1527 UBool result = FALSE; 1528 UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE; 1529 const char * collLocale; 1530 1531 if (U_FAILURE(status)) { 1532 log_err("Error creating iterator for testing validity\n"); 1533 return FALSE; 1534 } 1535 collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status); 1536 if (U_FAILURE(status) || collLocale==NULL) { 1537 status = U_ZERO_ERROR; 1538 collLocale = "?"; 1539 } 1540 1541 for (;;) { 1542 uint32_t ce = ucol_next(iter, &status); 1543 uint32_t primary, p1, p2, secondary, tertiary; 1544 if (ce == UCOL_NULLORDER) { 1545 result = TRUE; 1546 break; 1547 } 1548 if (ce == 0) { 1549 continue; 1550 } 1551 if (ce == 0x02000202) { 1552 /* special CE for merge-sort character */ 1553 if (*codepoints == 0xFFFE /* && length == 1 */) { 1554 /* 1555 * Note: We should check for length==1 but the token parser appears 1556 * to give us trailing NUL characters. 1557 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet() 1558 * rather than the internal collation rule parser 1559 */ 1560 continue; 1561 } else { 1562 log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n", 1563 (int)*codepoints, (int)length); 1564 break; 1565 } 1566 } 1567 primary = UCOL_PRIMARYORDER(ce); 1568 p1 = primary >> 8; 1569 p2 = primary & 0xFF; 1570 secondary = UCOL_SECONDARYORDER(ce); 1571 tertiary = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION; 1572 1573 if (!isContinuation(ce)) { 1574 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { 1575 log_err("Empty CE %08lX except for case bits\n", (long)ce); 1576 break; 1577 } 1578 if (p1 == 0) { 1579 if (p2 != 0) { 1580 log_err("Primary 00 xx in %08lX\n", (long)ce); 1581 break; 1582 } 1583 primaryDone = TRUE; 1584 } else { 1585 if (p1 <= 2 || p1 >= 0xF0) { 1586 /* Primary first bytes F0..FF are specials. */ 1587 log_err("Primary first byte of %08lX out of range\n", (long)ce); 1588 break; 1589 } 1590 if (p2 == 0) { 1591 primaryDone = TRUE; 1592 } else { 1593 if (p2 <= 3 || p2 >= 0xFF) { 1594 /* Primary second bytes 03 and FF are sort key compression terminators. */ 1595 log_err("Primary second byte of %08lX out of range\n", (long)ce); 1596 break; 1597 } 1598 primaryDone = FALSE; 1599 } 1600 } 1601 if (secondary == 0) { 1602 if (primary != 0) { 1603 log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce); 1604 break; 1605 } 1606 secondaryDone = TRUE; 1607 } else { 1608 if (secondary <= 2 || 1609 (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80)) 1610 ) { 1611 /* Secondary first bytes common+1..+0x80 are used for sort key compression. */ 1612 log_err("Secondary byte of %08lX out of range\n", (long)ce); 1613 break; 1614 } 1615 secondaryDone = FALSE; 1616 } 1617 if (tertiary == 0) { 1618 /* We know that ce != 0. */ 1619 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce); 1620 break; 1621 } 1622 if (tertiary <= 2) { 1623 log_err("Tertiary byte of %08lX out of range\n", (long)ce); 1624 break; 1625 } 1626 tertiaryDone = FALSE; 1627 } else { 1628 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { 1629 log_err("Empty continuation %08lX\n", (long)ce); 1630 break; 1631 } 1632 if (primaryDone && primary != 0) { 1633 log_err("Primary was done but continues in %08lX\n", (long)ce); 1634 break; 1635 } 1636 if (p1 == 0) { 1637 if (p2 != 0) { 1638 log_err("Primary 00 xx in %08lX\n", (long)ce); 1639 break; 1640 } 1641 primaryDone = TRUE; 1642 } else { 1643 if (p1 <= 2) { 1644 log_err("Primary first byte of %08lX out of range\n", (long)ce); 1645 break; 1646 } 1647 if (p2 == 0) { 1648 primaryDone = TRUE; 1649 } else { 1650 if (p2 <= 3) { 1651 log_err("Primary second byte of %08lX out of range\n", (long)ce); 1652 break; 1653 } 1654 } 1655 } 1656 if (secondaryDone && secondary != 0) { 1657 log_err("Secondary was done but continues in %08lX\n", (long)ce); 1658 break; 1659 } 1660 if (secondary == 0) { 1661 secondaryDone = TRUE; 1662 } else { 1663 if (secondary <= 2) { 1664 log_err("Secondary byte of %08lX out of range\n", (long)ce); 1665 break; 1666 } 1667 } 1668 if (tertiaryDone && tertiary != 0) { 1669 log_err("Tertiary was done but continues in %08lX\n", (long)ce); 1670 break; 1671 } 1672 if (tertiary == 0) { 1673 tertiaryDone = TRUE; 1674 } else if (tertiary <= 2) { 1675 log_err("Tertiary byte of %08lX out of range\n", (long)ce); 1676 break; 1677 } 1678 } 1679 } 1680 if (!result) { 1681 char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5]; 1682 showCodepoints(codepoints, length, codepointText); 1683 log_err("Locale: %s Code point string: %s\n", collLocale, codepointText); 1684 } 1685 ucol_closeElements(iter); 1686 return result; 1687 } 1688 1689 static const UChar IMPORT[] = { 0x5B, 0x69, 0x6D, 0x70, 0x6F, 0x72, 0x74, 0 }; /* "[import" */ 1690 1691 static void TestCEValidity() 1692 { 1693 /* testing UCA collation elements */ 1694 UErrorCode status = U_ZERO_ERROR; 1695 /* en_US has no tailorings */ 1696 UCollator *coll = ucol_open("root", &status); 1697 /* tailored locales */ 1698 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"}; 1699 const char *loc; 1700 FileStream *file = NULL; 1701 char line[2048]; 1702 UChar codepoints[11]; 1703 int count = 0; 1704 int maxCount = 0; 1705 UChar contextCPs[3]; 1706 UChar32 c; 1707 UParseError parseError; 1708 if (U_FAILURE(status)) { 1709 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1710 return; 1711 } 1712 log_verbose("Testing UCA elements\n"); 1713 file = getFractionalUCA(); 1714 if (file == NULL) { 1715 log_err("Fractional UCA data can not be opened\n"); 1716 return; 1717 } 1718 1719 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1720 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1721 line[0] == 0x000D || line[0] == '[') { 1722 continue; 1723 } 1724 1725 getCodePoints(line, codepoints, contextCPs); 1726 checkCEValidity(coll, codepoints, u_strlen(codepoints)); 1727 } 1728 1729 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1730 for (c = 0; c <= 0xffff; ++c) { 1731 if (u_isdefined(c)) { 1732 codepoints[0] = (UChar)c; 1733 checkCEValidity(coll, codepoints, 1); 1734 } 1735 } 1736 for (; c <= 0x10ffff; ++c) { 1737 if (u_isdefined(c)) { 1738 int32_t i = 0; 1739 U16_APPEND_UNSAFE(codepoints, i, c); 1740 checkCEValidity(coll, codepoints, i); 1741 } 1742 } 1743 1744 ucol_close(coll); 1745 1746 /* testing tailored collation elements */ 1747 log_verbose("Testing tailored elements\n"); 1748 if(getTestOption(QUICK_OPTION)) { 1749 maxCount = sizeof(locale)/sizeof(locale[0]); 1750 } else { 1751 maxCount = uloc_countAvailable(); 1752 } 1753 while (count < maxCount) { 1754 const UChar *rules = NULL, 1755 *current = NULL; 1756 UChar *rulesCopy = NULL; 1757 int32_t ruleLen = 0; 1758 1759 uint32_t chOffset = 0; 1760 uint32_t chLen = 0; 1761 uint32_t exOffset = 0; 1762 uint32_t exLen = 0; 1763 uint32_t prefixOffset = 0; 1764 uint32_t prefixLen = 0; 1765 UBool startOfRules = TRUE; 1766 UColOptionSet opts; 1767 1768 UColTokenParser src; 1769 uint32_t strength = 0; 1770 uint16_t specs = 0; 1771 if(getTestOption(QUICK_OPTION)) { 1772 loc = locale[count]; 1773 } else { 1774 loc = uloc_getAvailable(count); 1775 if(!hasCollationElements(loc)) { 1776 count++; 1777 continue; 1778 } 1779 } 1780 status = U_ZERO_ERROR; // clear status from previous loop iteration 1781 1782 uprv_memset(&src, 0, sizeof(UColTokenParser)); 1783 1784 log_verbose("Testing CEs for %s\n", loc); 1785 1786 coll = ucol_open(loc, &status); 1787 if (U_FAILURE(status)) { 1788 log_err("%s collator creation failed with status %s\n", loc, u_errorName(status)); 1789 return; 1790 } 1791 1792 src.opts = &opts; 1793 rules = ucol_getRules(coll, &ruleLen); 1794 1795 /* 1796 * We have not set up the UColTokenParser with a callback function 1797 * to fetch [import] sub-rules, 1798 * so skip testing tailorings that import others. 1799 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet() 1800 * rather than the internal collation rule parser 1801 */ 1802 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { 1803 rulesCopy = (UChar *)uprv_malloc((ruleLen + 1804 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1805 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1806 src.current = src.source = rulesCopy; 1807 src.end = rulesCopy + ruleLen; 1808 src.extraCurrent = src.end; 1809 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1810 1811 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to 1812 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ 1813 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL && U_SUCCESS(status)) { 1814 strength = src.parsedToken.strength; 1815 chOffset = src.parsedToken.charsOffset; 1816 chLen = src.parsedToken.charsLen; 1817 exOffset = src.parsedToken.extensionOffset; 1818 exLen = src.parsedToken.extensionLen; 1819 prefixOffset = src.parsedToken.prefixOffset; 1820 prefixLen = src.parsedToken.prefixLen; 1821 specs = src.parsedToken.flags; 1822 1823 startOfRules = FALSE; 1824 uprv_memcpy(codepoints, src.source + chOffset, 1825 chLen * sizeof(UChar)); 1826 codepoints[chLen] = 0; 1827 checkCEValidity(coll, codepoints, chLen); 1828 } 1829 if (U_FAILURE(status)) { 1830 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", loc, u_errorName(status)); 1831 } 1832 uprv_free(src.source); 1833 uprv_free(src.reorderCodes); 1834 } 1835 1836 ucol_close(coll); 1837 count ++; 1838 } 1839 T_FileStream_close(file); 1840 } 1841 1842 static void printSortKeyError(const UChar *codepoints, int length, 1843 uint8_t *sortkey, int sklen) 1844 { 1845 int count = 0; 1846 log_err("Sortkey not valid for "); 1847 while (length > 0) { 1848 log_err("0x%04x ", *codepoints); 1849 length --; 1850 codepoints ++; 1851 } 1852 log_err("\nSortkey : "); 1853 while (count < sklen) { 1854 log_err("0x%02x ", sortkey[count]); 1855 count ++; 1856 } 1857 log_err("\n"); 1858 } 1859 1860 /** 1861 * Checking sort key validity for all levels 1862 */ 1863 static UBool checkSortKeyValidity(UCollator *coll, 1864 const UChar *codepoints, 1865 int length) 1866 { 1867 UErrorCode status = U_ZERO_ERROR; 1868 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, 1869 UCOL_TERTIARY, UCOL_QUATERNARY, 1870 UCOL_IDENTICAL}; 1871 int strengthlen = 5; 1872 int strengthIndex = 0; 1873 int caselevel = 0; 1874 1875 while (caselevel < 1) { 1876 if (caselevel == 0) { 1877 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); 1878 } 1879 else { 1880 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); 1881 } 1882 1883 while (strengthIndex < strengthlen) { 1884 int count01 = 0; 1885 uint32_t count = 0; 1886 uint8_t sortkey[128]; 1887 uint32_t sklen; 1888 1889 ucol_setStrength(coll, strength[strengthIndex]); 1890 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); 1891 while (sortkey[count] != 0) { 1892 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) { 1893 printSortKeyError(codepoints, length, sortkey, sklen); 1894 return FALSE; 1895 } 1896 if (sortkey[count] == 1) { 1897 count01 ++; 1898 } 1899 count ++; 1900 } 1901 1902 if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) { 1903 printSortKeyError(codepoints, length, sortkey, sklen); 1904 return FALSE; 1905 } 1906 strengthIndex ++; 1907 } 1908 caselevel ++; 1909 } 1910 return TRUE; 1911 } 1912 1913 static void TestSortKeyValidity(void) 1914 { 1915 /* testing UCA collation elements */ 1916 UErrorCode status = U_ZERO_ERROR; 1917 /* en_US has no tailorings */ 1918 UCollator *coll = ucol_open("en_US", &status); 1919 /* tailored locales */ 1920 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; 1921 FileStream *file = NULL; 1922 char line[2048]; 1923 UChar codepoints[10]; 1924 int count = 0; 1925 UChar contextCPs[5]; 1926 UParseError parseError; 1927 if (U_FAILURE(status)) { 1928 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1929 return; 1930 } 1931 log_verbose("Testing UCA elements\n"); 1932 file = getFractionalUCA(); 1933 if (file == NULL) { 1934 log_err("Fractional UCA data can not be opened\n"); 1935 return; 1936 } 1937 1938 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1939 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1940 line[0] == 0x000D || line[0] == '[') { 1941 continue; 1942 } 1943 1944 getCodePoints(line, codepoints, contextCPs); 1945 if(codepoints[0] == 0xFFFE) { 1946 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */ 1947 continue; 1948 } 1949 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); 1950 } 1951 1952 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1953 codepoints[0] = 0; 1954 1955 while (codepoints[0] < 0xFFFF) { 1956 if (u_isdefined((UChar32)codepoints[0])) { 1957 checkSortKeyValidity(coll, codepoints, 1); 1958 } 1959 codepoints[0] ++; 1960 } 1961 1962 ucol_close(coll); 1963 1964 /* testing tailored collation elements */ 1965 log_verbose("Testing tailored elements\n"); 1966 while (count < 5) { 1967 const UChar *rules = NULL, 1968 *current = NULL; 1969 UChar *rulesCopy = NULL; 1970 int32_t ruleLen = 0; 1971 1972 uint32_t chOffset = 0; 1973 uint32_t chLen = 0; 1974 uint32_t exOffset = 0; 1975 uint32_t exLen = 0; 1976 uint32_t prefixOffset = 0; 1977 uint32_t prefixLen = 0; 1978 UBool startOfRules = TRUE; 1979 UColOptionSet opts; 1980 1981 UColTokenParser src; 1982 uint32_t strength = 0; 1983 uint16_t specs = 0; 1984 status = U_ZERO_ERROR; // clear status from previous loop iteration 1985 1986 uprv_memset(&src, 0, sizeof(UColTokenParser)); 1987 1988 coll = ucol_open(locale[count], &status); 1989 if (U_FAILURE(status)) { 1990 log_err("%s collator creation failed with status %s\n", locale[count], u_errorName(status)); 1991 return; 1992 } 1993 1994 src.opts = &opts; 1995 rules = ucol_getRules(coll, &ruleLen); 1996 1997 /* 1998 * We have not set up the UColTokenParser with a callback function 1999 * to fetch [import] sub-rules, 2000 * so skip testing tailorings that import others. 2001 * TODO: Ticket #8047: Change TestSortKeyValidity to use ucol_getTailoredSet() 2002 * rather than the internal collation rule parser 2003 */ 2004 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { 2005 rulesCopy = (UChar *)uprv_malloc((ruleLen + 2006 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 2007 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 2008 src.current = src.source = rulesCopy; 2009 src.end = rulesCopy + ruleLen; 2010 src.extraCurrent = src.end; 2011 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 2012 2013 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to 2014 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ 2015 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL && U_SUCCESS(status)) { 2016 strength = src.parsedToken.strength; 2017 chOffset = src.parsedToken.charsOffset; 2018 chLen = src.parsedToken.charsLen; 2019 exOffset = src.parsedToken.extensionOffset; 2020 exLen = src.parsedToken.extensionLen; 2021 prefixOffset = src.parsedToken.prefixOffset; 2022 prefixLen = src.parsedToken.prefixLen; 2023 specs = src.parsedToken.flags; 2024 2025 startOfRules = FALSE; 2026 uprv_memcpy(codepoints, src.source + chOffset, 2027 chLen * sizeof(UChar)); 2028 codepoints[chLen] = 0; 2029 if(codepoints[0] == 0xFFFE) { 2030 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */ 2031 continue; 2032 } 2033 checkSortKeyValidity(coll, codepoints, chLen); 2034 } 2035 if (U_FAILURE(status)) { 2036 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", locale[count], u_errorName(status)); 2037 } 2038 uprv_free(src.source); 2039 uprv_free(src.reorderCodes); 2040 } 2041 2042 ucol_close(coll); 2043 count ++; 2044 } 2045 T_FileStream_close(file); 2046 } 2047 2048 /** 2049 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with 2050 * normalization on AND jamo tailoring, among other things. 2051 */ 2052 static const UChar tsceText[] = { /* Nothing in here should be ignorable */ 2053 0x0020, 0xAC00, /* simple LV Hangul */ 2054 0x0020, 0xAC01, /* simple LVT Hangul */ 2055 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ 2056 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */ 2057 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ 2058 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ 2059 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */ 2060 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */ 2061 0x0020, 0x00E6, /* small letter ae, expands */ 2062 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */ 2063 0x0020 2064 }; 2065 enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; 2066 2067 static const int32_t rootStandardOffsets[] = { 2068 0, 1,2, 2069 2, 3,4,4, 2070 4, 5,6,6, 2071 6, 7,8,8, 2072 8, 9,10,11, 2073 12, 13,14,15, 2074 16, 17,18,19, 2075 20, 21,22,23, 2076 24, 25,26,26,26, 2077 26, 27,28,28, 2078 28, 2079 29 2080 }; 2081 enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) }; 2082 2083 static const int32_t rootSearchOffsets[] = { 2084 0, 1,2, 2085 2, 3,4,4, 2086 4, 5,6,6,6, 2087 6, 7,8,8,8,8,8,8, 2088 8, 9,10,11, 2089 12, 13,14,15, 2090 16, 17,18,19,20, 2091 20, 21,22,22,23,23,23,24, 2092 24, 25,26,26,26, 2093 26, 27,28,28, 2094 28, 2095 29 2096 }; 2097 enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) }; 2098 2099 typedef struct { 2100 const char * locale; 2101 const int32_t * offsets; 2102 int32_t offsetsLen; 2103 } TSCEItem; 2104 2105 static const TSCEItem tsceItems[] = { 2106 { "root", rootStandardOffsets, kLen_rootStandardOffsets }, 2107 { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets }, 2108 { NULL, NULL, 0 } 2109 }; 2110 2111 static void TestSearchCollatorElements(void) 2112 { 2113 const TSCEItem * tsceItemPtr; 2114 for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) { 2115 UErrorCode status = U_ZERO_ERROR; 2116 UCollator* ucol = ucol_open(tsceItemPtr->locale, &status); 2117 if ( U_SUCCESS(status) ) { 2118 UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status); 2119 if ( U_SUCCESS(status) ) { 2120 int32_t offset, element; 2121 const int32_t * nextOffsetPtr; 2122 const int32_t * limitOffsetPtr; 2123 2124 nextOffsetPtr = tsceItemPtr->offsets; 2125 limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; 2126 do { 2127 offset = ucol_getOffset(uce); 2128 element = ucol_next(uce, &status); 2129 if ( element == 0 ) { 2130 log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale ); 2131 } 2132 if ( nextOffsetPtr < limitOffsetPtr ) { 2133 if (offset != *nextOffsetPtr) { 2134 log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n", 2135 tsceItemPtr->locale, *nextOffsetPtr, offset ); 2136 nextOffsetPtr = limitOffsetPtr; 2137 break; 2138 } 2139 nextOffsetPtr++; 2140 } else { 2141 log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale ); 2142 } 2143 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); 2144 if ( nextOffsetPtr < limitOffsetPtr ) { 2145 log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale ); 2146 } 2147 2148 ucol_setOffset(uce, kLen_tsceText, &status); 2149 status = U_ZERO_ERROR; 2150 nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; 2151 limitOffsetPtr = tsceItemPtr->offsets; 2152 do { 2153 offset = ucol_getOffset(uce); 2154 element = ucol_previous(uce, &status); 2155 if ( element == 0 ) { 2156 log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale ); 2157 } 2158 if ( nextOffsetPtr > limitOffsetPtr ) { 2159 nextOffsetPtr--; 2160 if (offset != *nextOffsetPtr) { 2161 log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n", 2162 tsceItemPtr->locale, *nextOffsetPtr, offset ); 2163 nextOffsetPtr = limitOffsetPtr; 2164 break; 2165 } 2166 } else { 2167 log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale ); 2168 } 2169 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); 2170 if ( nextOffsetPtr > limitOffsetPtr ) { 2171 log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale ); 2172 } 2173 2174 ucol_closeElements(uce); 2175 } else { 2176 log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); 2177 } 2178 ucol_close(ucol); 2179 } else { 2180 log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); 2181 } 2182 } 2183 } 2184 2185 #endif /* #if !UCONFIG_NO_COLLATION */ 2186