1 /******************************************************************** 2 * Copyright (c) 1999-2009, International Business Machines 3 * Corporation and others. All Rights Reserved. 4 ******************************************************************** 5 * Date Name Description 6 * 12/14/99 Madhu Creation. 7 * 01/12/2000 Madhu updated for changed API 8 ********************************************************************/ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/uchar.h" 15 #include "intltest.h" 16 #include "unicode/rbbi.h" 17 #include "unicode/schriter.h" 18 #include "rbbiapts.h" 19 #include "rbbidata.h" 20 #include "cstring.h" 21 #include "ubrkimpl.h" 22 #include "unicode/ustring.h" 23 #include "unicode/utext.h" 24 #include "cmemory.h" 25 26 /** 27 * API Test the RuleBasedBreakIterator class 28 */ 29 30 31 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) {\ 32 errln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} 33 34 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ 35 errln("Test Failure at file %s, line %d", __FILE__, __LINE__);}} 36 37 void RBBIAPITest::TestCloneEquals() 38 { 39 40 UErrorCode status=U_ZERO_ERROR; 41 RuleBasedBreakIterator* bi1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 42 RuleBasedBreakIterator* biequal = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 43 RuleBasedBreakIterator* bi3 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 44 RuleBasedBreakIterator* bi2 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); 45 if(U_FAILURE(status)){ 46 errcheckln(status, "Fail : in construction - %s", u_errorName(status)); 47 return; 48 } 49 50 51 UnicodeString testString="Testing word break iterators's clone() and equals()"; 52 bi1->setText(testString); 53 bi2->setText(testString); 54 biequal->setText(testString); 55 56 bi3->setText("hello"); 57 58 logln((UnicodeString)"Testing equals()"); 59 60 logln((UnicodeString)"Testing == and !="); 61 UBool b = (*bi1 != *biequal); 62 b |= *bi1 == *bi2; 63 b |= *bi1 == *bi3; 64 if (b) { 65 errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed."); 66 } 67 68 if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3) 69 errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed."); 70 71 72 // Quick test of RulesBasedBreakIterator assignment - 73 // Check that 74 // two different iterators are != 75 // they are == after assignment 76 // source and dest iterator produce the same next() after assignment. 77 // deleting one doesn't disable the other. 78 logln("Testing assignment"); 79 RuleBasedBreakIterator *bix = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 80 if(U_FAILURE(status)){ 81 errcheckln(status, "Fail : in construction - %s", u_errorName(status)); 82 return; 83 } 84 85 RuleBasedBreakIterator biDefault, biDefault2; 86 if(U_FAILURE(status)){ 87 errln((UnicodeString)"FAIL : in construction of default iterator"); 88 return; 89 } 90 if (biDefault == *bix) { 91 errln((UnicodeString)"ERROR: iterators should not compare =="); 92 return; 93 } 94 if (biDefault != biDefault2) { 95 errln((UnicodeString)"ERROR: iterators should compare =="); 96 return; 97 } 98 99 100 UnicodeString HelloString("Hello Kitty"); 101 bix->setText(HelloString); 102 if (*bix == *bi2) { 103 errln(UnicodeString("ERROR: strings should not be equal before assignment.")); 104 } 105 *bix = *bi2; 106 if (*bix != *bi2) { 107 errln(UnicodeString("ERROR: strings should be equal before assignment.")); 108 } 109 110 int bixnext = bix->next(); 111 int bi2next = bi2->next(); 112 if (! (bixnext == bi2next && bixnext == 7)) { 113 errln(UnicodeString("ERROR: iterators behaved differently after assignment.")); 114 } 115 delete bix; 116 if (bi2->next() != 8) { 117 errln(UnicodeString("ERROR: iterator.next() failed after deleting copy.")); 118 } 119 120 121 122 logln((UnicodeString)"Testing clone()"); 123 RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone(); 124 RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone(); 125 126 if(*bi1clone != *bi1 || *bi1clone != *biequal || 127 *bi1clone == *bi3 || *bi1clone == *bi2) 128 errln((UnicodeString)"ERROR:1 RBBI's clone() method failed"); 129 130 if(*bi2clone == *bi1 || *bi2clone == *biequal || 131 *bi2clone == *bi3 || *bi2clone != *bi2) 132 errln((UnicodeString)"ERROR:2 RBBI's clone() method failed"); 133 134 if(bi1->getText() != bi1clone->getText() || 135 bi2clone->getText() != bi2->getText() || 136 *bi2clone == *bi1clone ) 137 errln((UnicodeString)"ERROR: RBBI's clone() method failed"); 138 139 delete bi1clone; 140 delete bi2clone; 141 delete bi1; 142 delete bi3; 143 delete bi2; 144 delete biequal; 145 } 146 147 void RBBIAPITest::TestBoilerPlate() 148 { 149 UErrorCode status = U_ZERO_ERROR; 150 BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status); 151 BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status); 152 if (U_FAILURE(status)) { 153 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 154 return; 155 } 156 if(*a!=*b){ 157 errln("Failed: boilerplate method operator!= does not return correct results"); 158 } 159 BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); 160 if(a && c){ 161 if(*c==*a){ 162 errln("Failed: boilerplate method opertator== does not return correct results"); 163 } 164 }else{ 165 errln("creation of break iterator failed"); 166 } 167 delete a; 168 delete b; 169 delete c; 170 } 171 172 void RBBIAPITest::TestgetRules() 173 { 174 UErrorCode status=U_ZERO_ERROR; 175 176 RuleBasedBreakIterator* bi1=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 177 RuleBasedBreakIterator* bi2=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); 178 if(U_FAILURE(status)){ 179 errcheckln(status, "FAIL: in construction - %s", u_errorName(status)); 180 delete bi1; 181 delete bi2; 182 return; 183 } 184 185 186 187 logln((UnicodeString)"Testing toString()"); 188 189 bi1->setText((UnicodeString)"Hello there"); 190 191 RuleBasedBreakIterator* bi3 =(RuleBasedBreakIterator*)bi1->clone(); 192 193 UnicodeString temp=bi1->getRules(); 194 UnicodeString temp2=bi2->getRules(); 195 UnicodeString temp3=bi3->getRules(); 196 if( temp2.compare(temp3) ==0 || temp.compare(temp2) == 0 || temp.compare(temp3) != 0) 197 errln((UnicodeString)"ERROR: error in getRules() method"); 198 199 delete bi1; 200 delete bi2; 201 delete bi3; 202 } 203 void RBBIAPITest::TestHashCode() 204 { 205 UErrorCode status=U_ZERO_ERROR; 206 RuleBasedBreakIterator* bi1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 207 RuleBasedBreakIterator* bi3 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 208 RuleBasedBreakIterator* bi2 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); 209 if(U_FAILURE(status)){ 210 errcheckln(status, "Fail : in construction - %s", u_errorName(status)); 211 delete bi1; 212 delete bi2; 213 delete bi3; 214 return; 215 } 216 217 218 logln((UnicodeString)"Testing hashCode()"); 219 220 bi1->setText((UnicodeString)"Hash code"); 221 bi2->setText((UnicodeString)"Hash code"); 222 bi3->setText((UnicodeString)"Hash code"); 223 224 RuleBasedBreakIterator* bi1clone= (RuleBasedBreakIterator*)bi1->clone(); 225 RuleBasedBreakIterator* bi2clone= (RuleBasedBreakIterator*)bi2->clone(); 226 227 if(bi1->hashCode() != bi1clone->hashCode() || bi1->hashCode() != bi3->hashCode() || 228 bi1clone->hashCode() != bi3->hashCode() || bi2->hashCode() != bi2clone->hashCode()) 229 errln((UnicodeString)"ERROR: identical objects have different hashcodes"); 230 231 if(bi1->hashCode() == bi2->hashCode() || bi2->hashCode() == bi3->hashCode() || 232 bi1clone->hashCode() == bi2clone->hashCode() || bi1clone->hashCode() == bi2->hashCode()) 233 errln((UnicodeString)"ERROR: different objects have same hashcodes"); 234 235 delete bi1clone; 236 delete bi2clone; 237 delete bi1; 238 delete bi2; 239 delete bi3; 240 241 } 242 void RBBIAPITest::TestGetSetAdoptText() 243 { 244 logln((UnicodeString)"Testing getText setText "); 245 UErrorCode status=U_ZERO_ERROR; 246 UnicodeString str1="first string."; 247 UnicodeString str2="Second string."; 248 RuleBasedBreakIterator* charIter1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 249 RuleBasedBreakIterator* wordIter1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); 250 if(U_FAILURE(status)){ 251 errcheckln(status, "Fail : in construction - %s", u_errorName(status)); 252 return; 253 } 254 255 256 CharacterIterator* text1= new StringCharacterIterator(str1); 257 CharacterIterator* text1Clone = text1->clone(); 258 CharacterIterator* text2= new StringCharacterIterator(str2); 259 CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); // "ond str" 260 261 wordIter1->setText(str1); 262 CharacterIterator *tci = &wordIter1->getText(); 263 UnicodeString tstr; 264 tci->getText(tstr); 265 TEST_ASSERT(tstr == str1); 266 if(wordIter1->current() != 0) 267 errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n"); 268 269 wordIter1->next(2); 270 271 wordIter1->setText(str2); 272 if(wordIter1->current() != 0) 273 errln((UnicodeString)"ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n"); 274 275 276 charIter1->adoptText(text1Clone); 277 TEST_ASSERT(wordIter1->getText() != charIter1->getText()); 278 tci = &wordIter1->getText(); 279 tci->getText(tstr); 280 TEST_ASSERT(tstr == str2); 281 tci = &charIter1->getText(); 282 tci->getText(tstr); 283 TEST_ASSERT(tstr == str1); 284 285 286 RuleBasedBreakIterator* rb=(RuleBasedBreakIterator*)wordIter1->clone(); 287 rb->adoptText(text1); 288 if(rb->getText() != *text1) 289 errln((UnicodeString)"ERROR:1 error in adoptText "); 290 rb->adoptText(text2); 291 if(rb->getText() != *text2) 292 errln((UnicodeString)"ERROR:2 error in adoptText "); 293 294 // Adopt where iterator range is less than the entire orignal source string. 295 // (With the change of the break engine to working with UText internally, 296 // CharacterIterators starting at positions other than zero are not supported) 297 rb->adoptText(text3); 298 TEST_ASSERT(rb->preceding(2) == 0); 299 TEST_ASSERT(rb->following(11) == BreakIterator::DONE); 300 //if(rb->preceding(2) != 3) { 301 // errln((UnicodeString)"ERROR:3 error in adoptText "); 302 //} 303 //if(rb->following(11) != BreakIterator::DONE) { 304 // errln((UnicodeString)"ERROR:4 error in adoptText "); 305 //} 306 307 // UText API 308 // 309 // Quick test to see if UText is working at all. 310 // 311 const char *s1 = "\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"; /* "hello world" in UTF-8 */ 312 const char *s2 = "\x73\x65\x65\x20\x79\x61"; /* "see ya" in UTF-8 */ 313 // 012345678901 314 315 status = U_ZERO_ERROR; 316 UText *ut = utext_openUTF8(NULL, s1, -1, &status); 317 wordIter1->setText(ut, status); 318 TEST_ASSERT_SUCCESS(status); 319 320 int32_t pos; 321 pos = wordIter1->first(); 322 TEST_ASSERT(pos==0); 323 pos = wordIter1->next(); 324 TEST_ASSERT(pos==5); 325 pos = wordIter1->next(); 326 TEST_ASSERT(pos==6); 327 pos = wordIter1->next(); 328 TEST_ASSERT(pos==11); 329 pos = wordIter1->next(); 330 TEST_ASSERT(pos==UBRK_DONE); 331 332 status = U_ZERO_ERROR; 333 UText *ut2 = utext_openUTF8(NULL, s2, -1, &status); 334 TEST_ASSERT_SUCCESS(status); 335 wordIter1->setText(ut2, status); 336 TEST_ASSERT_SUCCESS(status); 337 338 pos = wordIter1->first(); 339 TEST_ASSERT(pos==0); 340 pos = wordIter1->next(); 341 TEST_ASSERT(pos==3); 342 pos = wordIter1->next(); 343 TEST_ASSERT(pos==4); 344 345 pos = wordIter1->last(); 346 TEST_ASSERT(pos==6); 347 pos = wordIter1->previous(); 348 TEST_ASSERT(pos==4); 349 pos = wordIter1->previous(); 350 TEST_ASSERT(pos==3); 351 pos = wordIter1->previous(); 352 TEST_ASSERT(pos==0); 353 pos = wordIter1->previous(); 354 TEST_ASSERT(pos==UBRK_DONE); 355 356 status = U_ZERO_ERROR; 357 UnicodeString sEmpty; 358 UText *gut2 = utext_openUnicodeString(NULL, &sEmpty, &status); 359 wordIter1->getUText(gut2, status); 360 TEST_ASSERT_SUCCESS(status); 361 utext_close(gut2); 362 363 utext_close(ut); 364 utext_close(ut2); 365 366 delete wordIter1; 367 delete charIter1; 368 delete rb; 369 370 } 371 372 373 void RBBIAPITest::TestIteration() 374 { 375 // This test just verifies that the API is present. 376 // Testing for correct operation of the break rules happens elsewhere. 377 378 UErrorCode status=U_ZERO_ERROR; 379 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 380 if (U_FAILURE(status) || bi == NULL) { 381 errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status)); 382 } 383 delete bi; 384 385 status=U_ZERO_ERROR; 386 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); 387 if (U_FAILURE(status) || bi == NULL) { 388 errcheckln(status, "Failure creating Word break iterator. Status = %s", u_errorName(status)); 389 } 390 delete bi; 391 392 status=U_ZERO_ERROR; 393 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status); 394 if (U_FAILURE(status) || bi == NULL) { 395 errcheckln(status, "Failure creating Line break iterator. Status = %s", u_errorName(status)); 396 } 397 delete bi; 398 399 status=U_ZERO_ERROR; 400 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status); 401 if (U_FAILURE(status) || bi == NULL) { 402 errcheckln(status, "Failure creating Sentence break iterator. Status = %s", u_errorName(status)); 403 } 404 delete bi; 405 406 status=U_ZERO_ERROR; 407 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status); 408 if (U_FAILURE(status) || bi == NULL) { 409 errcheckln(status, "Failure creating Title break iterator. Status = %s", u_errorName(status)); 410 } 411 delete bi; 412 413 status=U_ZERO_ERROR; 414 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); 415 if (U_FAILURE(status) || bi == NULL) { 416 errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status)); 417 return; // Skip the rest of these tests. 418 } 419 420 421 UnicodeString testString="0123456789"; 422 bi->setText(testString); 423 424 int32_t i; 425 i = bi->first(); 426 if (i != 0) { 427 errln("Incorrect value from bi->first(). Expected 0, got %d.", i); 428 } 429 430 i = bi->last(); 431 if (i != 10) { 432 errln("Incorrect value from bi->last(). Expected 10, got %d", i); 433 } 434 435 // 436 // Previous 437 // 438 bi->last(); 439 i = bi->previous(); 440 if (i != 9) { 441 errln("Incorrect value from bi->last() at line %d. Expected 9, got %d", __LINE__, i); 442 } 443 444 445 bi->first(); 446 i = bi->previous(); 447 if (i != BreakIterator::DONE) { 448 errln("Incorrect value from bi->previous() at line %d. Expected DONE, got %d", __LINE__, i); 449 } 450 451 // 452 // next() 453 // 454 bi->first(); 455 i = bi->next(); 456 if (i != 1) { 457 errln("Incorrect value from bi->next() at line %d. Expected 1, got %d", __LINE__, i); 458 } 459 460 bi->last(); 461 i = bi->next(); 462 if (i != BreakIterator::DONE) { 463 errln("Incorrect value from bi->next() at line %d. Expected DONE, got %d", __LINE__, i); 464 } 465 466 467 // 468 // current() 469 // 470 bi->first(); 471 i = bi->current(); 472 if (i != 0) { 473 errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i); 474 } 475 476 bi->next(); 477 i = bi->current(); 478 if (i != 1) { 479 errln("Incorrect value from bi->previous() at line %d. Expected 1, got %d", __LINE__, i); 480 } 481 482 bi->last(); 483 bi->next(); 484 i = bi->current(); 485 if (i != 10) { 486 errln("Incorrect value from bi->previous() at line %d. Expected 10, got %d", __LINE__, i); 487 } 488 489 bi->first(); 490 bi->previous(); 491 i = bi->current(); 492 if (i != 0) { 493 errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i); 494 } 495 496 497 // 498 // Following() 499 // 500 i = bi->following(4); 501 if (i != 5) { 502 errln("Incorrect value from bi->following() at line %d. Expected 5, got %d", __LINE__, i); 503 } 504 505 i = bi->following(9); 506 if (i != 10) { 507 errln("Incorrect value from bi->following() at line %d. Expected 10, got %d", __LINE__, i); 508 } 509 510 i = bi->following(10); 511 if (i != BreakIterator::DONE) { 512 errln("Incorrect value from bi->following() at line %d. Expected DONE, got %d", __LINE__, i); 513 } 514 515 516 // 517 // Preceding 518 // 519 i = bi->preceding(4); 520 if (i != 3) { 521 errln("Incorrect value from bi->preceding() at line %d. Expected 3, got %d", __LINE__, i); 522 } 523 524 i = bi->preceding(10); 525 if (i != 9) { 526 errln("Incorrect value from bi->preceding() at line %d. Expected 9, got %d", __LINE__, i); 527 } 528 529 i = bi->preceding(1); 530 if (i != 0) { 531 errln("Incorrect value from bi->preceding() at line %d. Expected 0, got %d", __LINE__, i); 532 } 533 534 i = bi->preceding(0); 535 if (i != BreakIterator::DONE) { 536 errln("Incorrect value from bi->preceding() at line %d. Expected DONE, got %d", __LINE__, i); 537 } 538 539 540 // 541 // isBoundary() 542 // 543 bi->first(); 544 if (bi->isBoundary(3) != TRUE) { 545 errln("Incorrect value from bi->isBoudary() at line %d. Expected TRUE, got FALSE", __LINE__, i); 546 } 547 i = bi->current(); 548 if (i != 3) { 549 errln("Incorrect value from bi->current() at line %d. Expected 3, got %d", __LINE__, i); 550 } 551 552 553 if (bi->isBoundary(11) != FALSE) { 554 errln("Incorrect value from bi->isBoudary() at line %d. Expected FALSE, got TRUE", __LINE__, i); 555 } 556 i = bi->current(); 557 if (i != 10) { 558 errln("Incorrect value from bi->current() at line %d. Expected 10, got %d", __LINE__, i); 559 } 560 561 // 562 // next(n) 563 // 564 bi->first(); 565 i = bi->next(4); 566 if (i != 4) { 567 errln("Incorrect value from bi->next() at line %d. Expected 4, got %d", __LINE__, i); 568 } 569 570 i = bi->next(6); 571 if (i != 10) { 572 errln("Incorrect value from bi->next() at line %d. Expected 10, got %d", __LINE__, i); 573 } 574 575 bi->first(); 576 i = bi->next(11); 577 if (i != BreakIterator::DONE) { 578 errln("Incorrect value from bi->next() at line %d. Expected BreakIterator::DONE, got %d", __LINE__, i); 579 } 580 581 delete bi; 582 583 } 584 585 586 587 588 589 590 void RBBIAPITest::TestBuilder() { 591 UnicodeString rulesString1 = "$Letters = [:L:];\n" 592 "$Numbers = [:N:];\n" 593 "$Letters+;\n" 594 "$Numbers+;\n" 595 "[^$Letters $Numbers];\n" 596 "!.*;\n"; 597 UnicodeString testString1 = "abc123..abc"; 598 // 01234567890 599 int32_t bounds1[] = {0, 3, 6, 7, 8, 11}; 600 UErrorCode status=U_ZERO_ERROR; 601 UParseError parseError; 602 603 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 604 if(U_FAILURE(status)) { 605 dataerrln("Fail : in construction - %s", u_errorName(status)); 606 } else { 607 bi->setText(testString1); 608 doBoundaryTest(*bi, testString1, bounds1); 609 } 610 delete bi; 611 } 612 613 614 // 615 // TestQuoteGrouping 616 // Single quotes within rules imply a grouping, so that a modifier 617 // following the quoted text (* or +) applies to all of the quoted chars. 618 // 619 void RBBIAPITest::TestQuoteGrouping() { 620 UnicodeString rulesString1 = "#Here comes the rule...\n" 621 "'$@!'*;\n" // (\$\@\!)* 622 ".;\n"; 623 624 UnicodeString testString1 = "$@!$@!X$@!!X"; 625 // 0123456789012 626 int32_t bounds1[] = {0, 6, 7, 10, 11, 12}; 627 UErrorCode status=U_ZERO_ERROR; 628 UParseError parseError; 629 630 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 631 if(U_FAILURE(status)) { 632 dataerrln("Fail : in construction - %s", u_errorName(status)); 633 } else { 634 bi->setText(testString1); 635 doBoundaryTest(*bi, testString1, bounds1); 636 } 637 delete bi; 638 } 639 640 // 641 // TestRuleStatus 642 // Test word break rule status constants. 643 // 644 void RBBIAPITest::TestRuleStatus() { 645 UChar str[30]; 646 u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094", 647 // 012345678901234567 8 9 0 1 2 3 4 5 6 648 // Ideographic Katakana Hiragana 649 str, 30); 650 UnicodeString testString1(str); 651 int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; 652 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, 653 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, 654 UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, 655 UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; 656 657 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, 658 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, 659 UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, 660 UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; 661 662 UErrorCode status=U_ZERO_ERROR; 663 664 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 665 if(U_FAILURE(status)) { 666 errcheckln(status, "Fail : in construction - %s", u_errorName(status)); 667 } else { 668 bi->setText(testString1); 669 // First test that the breaks are in the right spots. 670 doBoundaryTest(*bi, testString1, bounds1); 671 672 // Then go back and check tag values 673 int32_t i = 0; 674 int32_t pos, tag; 675 for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) { 676 if (pos != bounds1[i]) { 677 errln("FAIL: unexpected word break at postion %d", pos); 678 break; 679 } 680 tag = bi->getRuleStatus(); 681 if (tag < tag_lo[i] || tag >= tag_hi[i]) { 682 errln("FAIL: incorrect tag value %d at position %d", tag, pos); 683 break; 684 } 685 686 // Check that we get the same tag values from getRuleStatusVec() 687 int32_t vec[10]; 688 int t = bi->getRuleStatusVec(vec, 10, status); 689 TEST_ASSERT_SUCCESS(status); 690 TEST_ASSERT(t==1); 691 TEST_ASSERT(vec[0] == tag); 692 } 693 } 694 delete bi; 695 696 // Now test line break status. This test mostly is to confirm that the status constants 697 // are correctly declared in the header. 698 testString1 = "test line. \n"; 699 // break type s s h 700 701 bi = (RuleBasedBreakIterator *) 702 BreakIterator::createLineInstance(Locale::getEnglish(), status); 703 if(U_FAILURE(status)) { 704 errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status)); 705 } else { 706 int32_t i = 0; 707 int32_t pos, tag; 708 UBool success; 709 710 bi->setText(testString1); 711 pos = bi->current(); 712 tag = bi->getRuleStatus(); 713 for (i=0; i<3; i++) { 714 switch (i) { 715 case 0: 716 success = pos==0 && tag==UBRK_LINE_SOFT; break; 717 case 1: 718 success = pos==5 && tag==UBRK_LINE_SOFT; break; 719 case 2: 720 success = pos==12 && tag==UBRK_LINE_HARD; break; 721 default: 722 success = FALSE; break; 723 } 724 if (success == FALSE) { 725 errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d", 726 i, pos, tag); 727 break; 728 } 729 pos = bi->next(); 730 tag = bi->getRuleStatus(); 731 } 732 if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT || 733 UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT || 734 UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT ) { 735 errln("UBRK_LINE_* constants from header are inconsistent."); 736 } 737 } 738 delete bi; 739 740 } 741 742 743 // 744 // TestRuleStatusVec 745 // Test the vector form of break rule status. 746 // 747 void RBBIAPITest::TestRuleStatusVec() { 748 UnicodeString rulesString( "[A-N]{100}; \n" 749 "[a-w]{200}; \n" 750 "[\\p{L}]{300}; \n" 751 "[\\p{N}]{400}; \n" 752 "[0-5]{500}; \n" 753 "!.*;\n", -1, US_INV); 754 UnicodeString testString1 = "Aapz5?"; 755 int32_t statusVals[10]; 756 int32_t numStatuses; 757 int32_t pos; 758 759 UErrorCode status=U_ZERO_ERROR; 760 UParseError parseError; 761 762 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status); 763 if (U_FAILURE(status)) { 764 dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); 765 } else { 766 bi->setText(testString1); 767 768 // A 769 pos = bi->next(); 770 TEST_ASSERT(pos==1); 771 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 772 TEST_ASSERT_SUCCESS(status); 773 TEST_ASSERT(numStatuses == 2); 774 TEST_ASSERT(statusVals[0] == 100); 775 TEST_ASSERT(statusVals[1] == 300); 776 777 // a 778 pos = bi->next(); 779 TEST_ASSERT(pos==2); 780 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 781 TEST_ASSERT_SUCCESS(status); 782 TEST_ASSERT(numStatuses == 2); 783 TEST_ASSERT(statusVals[0] == 200); 784 TEST_ASSERT(statusVals[1] == 300); 785 786 // p 787 pos = bi->next(); 788 TEST_ASSERT(pos==3); 789 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 790 TEST_ASSERT_SUCCESS(status); 791 TEST_ASSERT(numStatuses == 2); 792 TEST_ASSERT(statusVals[0] == 200); 793 TEST_ASSERT(statusVals[1] == 300); 794 795 // z 796 pos = bi->next(); 797 TEST_ASSERT(pos==4); 798 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 799 TEST_ASSERT_SUCCESS(status); 800 TEST_ASSERT(numStatuses == 1); 801 TEST_ASSERT(statusVals[0] == 300); 802 803 // 5 804 pos = bi->next(); 805 TEST_ASSERT(pos==5); 806 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 807 TEST_ASSERT_SUCCESS(status); 808 TEST_ASSERT(numStatuses == 2); 809 TEST_ASSERT(statusVals[0] == 400); 810 TEST_ASSERT(statusVals[1] == 500); 811 812 // ? 813 pos = bi->next(); 814 TEST_ASSERT(pos==6); 815 numStatuses = bi->getRuleStatusVec(statusVals, 10, status); 816 TEST_ASSERT_SUCCESS(status); 817 TEST_ASSERT(numStatuses == 1); 818 TEST_ASSERT(statusVals[0] == 0); 819 820 // 821 // Check buffer overflow error handling. Char == A 822 // 823 bi->first(); 824 pos = bi->next(); 825 TEST_ASSERT(pos==1); 826 memset(statusVals, -1, sizeof(statusVals)); 827 numStatuses = bi->getRuleStatusVec(statusVals, 0, status); 828 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); 829 TEST_ASSERT(numStatuses == 2); 830 TEST_ASSERT(statusVals[0] == -1); 831 832 status = U_ZERO_ERROR; 833 memset(statusVals, -1, sizeof(statusVals)); 834 numStatuses = bi->getRuleStatusVec(statusVals, 1, status); 835 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); 836 TEST_ASSERT(numStatuses == 2); 837 TEST_ASSERT(statusVals[0] == 100); 838 TEST_ASSERT(statusVals[1] == -1); 839 840 status = U_ZERO_ERROR; 841 memset(statusVals, -1, sizeof(statusVals)); 842 numStatuses = bi->getRuleStatusVec(statusVals, 2, status); 843 TEST_ASSERT_SUCCESS(status); 844 TEST_ASSERT(numStatuses == 2); 845 TEST_ASSERT(statusVals[0] == 100); 846 TEST_ASSERT(statusVals[1] == 300); 847 TEST_ASSERT(statusVals[2] == -1); 848 } 849 delete bi; 850 851 } 852 853 // 854 // Bug 2190 Regression test. Builder crash on rule consisting of only a 855 // $variable reference 856 void RBBIAPITest::TestBug2190() { 857 UnicodeString rulesString1 = "$aaa = abcd;\n" 858 "$bbb = $aaa;\n" 859 "$bbb;\n"; 860 UnicodeString testString1 = "abcdabcd"; 861 // 01234567890 862 int32_t bounds1[] = {0, 4, 8}; 863 UErrorCode status=U_ZERO_ERROR; 864 UParseError parseError; 865 866 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 867 if(U_FAILURE(status)) { 868 dataerrln("Fail : in construction - %s", u_errorName(status)); 869 } else { 870 bi->setText(testString1); 871 doBoundaryTest(*bi, testString1, bounds1); 872 } 873 delete bi; 874 } 875 876 877 void RBBIAPITest::TestRegistration() { 878 #if !UCONFIG_NO_SERVICE 879 UErrorCode status = U_ZERO_ERROR; 880 BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status); 881 882 // ok to not delete these if we exit because of error? 883 BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status); 884 BreakIterator* root_word = BreakIterator::createWordInstance("", status); 885 BreakIterator* root_char = BreakIterator::createCharacterInstance("", status); 886 887 if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) { 888 dataerrln("Error creating instances of break interactors - %s", u_errorName(status)); 889 delete ja_word; 890 delete ja_char; 891 delete root_word; 892 delete root_char; 893 894 return; 895 } 896 897 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status); 898 { 899 if (ja_word && *ja_word == *root_word) { 900 errln("japan not different from root"); 901 } 902 } 903 904 { 905 BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status); 906 UBool fail = TRUE; 907 if(result){ 908 fail = *result != *ja_word; 909 } 910 delete result; 911 if (fail) { 912 errln("bad result for xx_XX/word"); 913 } 914 } 915 916 { 917 BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status); 918 UBool fail = TRUE; 919 if(result){ 920 fail = *result != *ja_char; 921 } 922 delete result; 923 if (fail) { 924 errln("bad result for ja_JP/char"); 925 } 926 } 927 928 { 929 BreakIterator* result = BreakIterator::createCharacterInstance("xx_XX", status); 930 UBool fail = TRUE; 931 if(result){ 932 fail = *result != *root_char; 933 } 934 delete result; 935 if (fail) { 936 errln("bad result for xx_XX/char"); 937 } 938 } 939 940 { 941 StringEnumeration* avail = BreakIterator::getAvailableLocales(); 942 UBool found = FALSE; 943 const UnicodeString* p; 944 while ((p = avail->snext(status))) { 945 if (p->compare("xx") == 0) { 946 found = TRUE; 947 break; 948 } 949 } 950 delete avail; 951 if (!found) { 952 errln("did not find test locale"); 953 } 954 } 955 956 { 957 UBool unreg = BreakIterator::unregister(key, status); 958 if (!unreg) { 959 errln("unable to unregister"); 960 } 961 } 962 963 { 964 BreakIterator* result = BreakIterator::createWordInstance("en_US", status); 965 BreakIterator* root = BreakIterator::createWordInstance("", status); 966 UBool fail = TRUE; 967 if(root){ 968 fail = *root != *result; 969 } 970 delete root; 971 delete result; 972 if (fail) { 973 errln("did not get root break"); 974 } 975 } 976 977 { 978 StringEnumeration* avail = BreakIterator::getAvailableLocales(); 979 UBool found = FALSE; 980 const UnicodeString* p; 981 while ((p = avail->snext(status))) { 982 if (p->compare("xx") == 0) { 983 found = TRUE; 984 break; 985 } 986 } 987 delete avail; 988 if (found) { 989 errln("found test locale"); 990 } 991 } 992 993 { 994 int32_t count; 995 UBool foundLocale = FALSE; 996 const Locale *avail = BreakIterator::getAvailableLocales(count); 997 for (int i=0; i<count; i++) { 998 if (avail[i] == Locale::getEnglish()) { 999 foundLocale = TRUE; 1000 break; 1001 } 1002 } 1003 if (foundLocale == FALSE) { 1004 errln("BreakIterator::getAvailableLocales(&count), failed to find EN."); 1005 } 1006 } 1007 1008 1009 // ja_word was adopted by factory 1010 delete ja_char; 1011 delete root_word; 1012 delete root_char; 1013 #endif 1014 } 1015 1016 void RBBIAPITest::RoundtripRule(const char *dataFile) { 1017 UErrorCode status = U_ZERO_ERROR; 1018 UParseError parseError; 1019 parseError.line = 0; 1020 parseError.offset = 0; 1021 UDataMemory *data = udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status); 1022 uint32_t length; 1023 const UChar *builtSource; 1024 const uint8_t *rbbiRules; 1025 const uint8_t *builtRules; 1026 1027 if (U_FAILURE(status)) { 1028 errcheckln(status, "Can't open \"%s\" - %s", dataFile, u_errorName(status)); 1029 return; 1030 } 1031 1032 builtRules = (const uint8_t *)udata_getMemory(data); 1033 builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource); 1034 RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status); 1035 if (U_FAILURE(status)) { 1036 errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 1037 u_errorName(status), parseError.line, parseError.offset); 1038 return; 1039 }; 1040 rbbiRules = brkItr->getBinaryRules(length); 1041 logln("Comparing \"%s\" len=%d", dataFile, length); 1042 if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) { 1043 errln("Built rules and rebuilt rules are different %s", dataFile); 1044 return; 1045 } 1046 delete brkItr; 1047 udata_close(data); 1048 } 1049 1050 void RBBIAPITest::TestRoundtripRules() { 1051 RoundtripRule("word"); 1052 RoundtripRule("title"); 1053 RoundtripRule("sent"); 1054 RoundtripRule("line"); 1055 RoundtripRule("char"); 1056 if (!quick) { 1057 RoundtripRule("word_ja"); 1058 RoundtripRule("word_POSIX"); 1059 } 1060 } 1061 1062 // Try out the RuleBasedBreakIterator constructors that take RBBIDataHeader* 1063 // (these are protected so we access them via a local class RBBIWithProtectedFunctions). 1064 // This is just a sanity check, not a thorough test (e.g. we don't check that the 1065 // first delete actually frees rulesCopy). 1066 void RBBIAPITest::TestCreateFromRBBIData() { 1067 // Get some handy RBBIData 1068 const char *brkName = "word"; // or "sent", "line", "char", etc. 1069 UErrorCode status = U_ZERO_ERROR; 1070 UDataMemory * data = udata_open(U_ICUDATA_BRKITR, "brk", brkName, &status); 1071 if ( U_SUCCESS(status) ) { 1072 const RBBIDataHeader * builtRules = (const RBBIDataHeader *)udata_getMemory(data); 1073 uint32_t length = builtRules->fLength; 1074 RBBIWithProtectedFunctions * brkItr; 1075 1076 // Try the memory-adopting constructor, need to copy the data first 1077 RBBIDataHeader * rulesCopy = (RBBIDataHeader *) uprv_malloc(length); 1078 if ( rulesCopy ) { 1079 uprv_memcpy( rulesCopy, builtRules, length ); 1080 1081 brkItr = new RBBIWithProtectedFunctions(rulesCopy, status); 1082 if ( U_SUCCESS(status) ) { 1083 delete brkItr; // this should free rulesCopy 1084 } else { 1085 errln("create RuleBasedBreakIterator from RBBIData (adopted): ICU Error \"%s\"\n", u_errorName(status) ); 1086 status = U_ZERO_ERROR;// reset for the next test 1087 uprv_free( rulesCopy ); 1088 } 1089 } 1090 1091 // Now try the non-adopting constructor 1092 brkItr = new RBBIWithProtectedFunctions(builtRules, RBBIWithProtectedFunctions::kDontAdopt, status); 1093 if ( U_SUCCESS(status) ) { 1094 delete brkItr; // this should NOT attempt to free builtRules 1095 if (builtRules->fLength != length) { // sanity check 1096 errln("create RuleBasedBreakIterator from RBBIData (non-adopted): delete affects data\n" ); 1097 } 1098 } else { 1099 errln("create RuleBasedBreakIterator from RBBIData (non-adopted): ICU Error \"%s\"\n", u_errorName(status) ); 1100 } 1101 1102 udata_close(data); 1103 } 1104 } 1105 1106 //--------------------------------------------- 1107 // runIndexedTest 1108 //--------------------------------------------- 1109 1110 void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 1111 { 1112 if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API "); 1113 switch (index) { 1114 // case 0: name = "TestConstruction"; if (exec) TestConstruction(); break; 1115 case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break; 1116 case 1: name = "TestgetRules"; if (exec) TestgetRules(); break; 1117 case 2: name = "TestHashCode"; if (exec) TestHashCode(); break; 1118 case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break; 1119 case 4: name = "TestIteration"; if (exec) TestIteration(); break; 1120 case 5: name = "TestBuilder"; if (exec) TestBuilder(); break; 1121 case 6: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break; 1122 case 7: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break; 1123 case 8: name = "TestRuleStatusVec"; if (exec) TestRuleStatusVec(); break; 1124 case 9: name = "TestBug2190"; if (exec) TestBug2190(); break; 1125 case 10: name = "TestRegistration"; if (exec) TestRegistration(); break; 1126 case 11: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break; 1127 case 12: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break; 1128 case 13: name = "TestCreateFromRBBIData"; if (exec) TestCreateFromRBBIData(); break; 1129 1130 default: name = ""; break; // needed to end loop 1131 } 1132 } 1133 1134 //--------------------------------------------- 1135 //Internal subroutines 1136 //--------------------------------------------- 1137 1138 void RBBIAPITest::doBoundaryTest(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries){ 1139 logln((UnicodeString)"testIsBoundary():"); 1140 int32_t p = 0; 1141 UBool isB; 1142 for (int32_t i = 0; i < text.length(); i++) { 1143 isB = bi.isBoundary(i); 1144 logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB); 1145 1146 if (i == boundaries[p]) { 1147 if (!isB) 1148 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false"); 1149 p++; 1150 } 1151 else { 1152 if (isB) 1153 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true"); 1154 } 1155 } 1156 } 1157 void RBBIAPITest::doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expectedString){ 1158 UnicodeString selected; 1159 UnicodeString expected=CharsToUnicodeString(expectedString); 1160 1161 if(gotoffset != expectedOffset) 1162 errln((UnicodeString)"ERROR:****returned #" + gotoffset + (UnicodeString)" instead of #" + expectedOffset); 1163 if(start <= gotoffset){ 1164 testString.extractBetween(start, gotoffset, selected); 1165 } 1166 else{ 1167 testString.extractBetween(gotoffset, start, selected); 1168 } 1169 if(selected.compare(expected) != 0) 1170 errln(prettify((UnicodeString)"ERROR:****selected \"" + selected + "\" instead of \"" + expected + "\"")); 1171 else 1172 logln(prettify("****selected \"" + selected + "\"")); 1173 } 1174 1175 //--------------------------------------------- 1176 //RBBIWithProtectedFunctions class functions 1177 //--------------------------------------------- 1178 1179 RBBIWithProtectedFunctions::RBBIWithProtectedFunctions(RBBIDataHeader* data, UErrorCode &status) 1180 : RuleBasedBreakIterator(data, status) 1181 { 1182 } 1183 1184 RBBIWithProtectedFunctions::RBBIWithProtectedFunctions(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 1185 : RuleBasedBreakIterator(data, RuleBasedBreakIterator::kDontAdopt, status) 1186 { 1187 } 1188 1189 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1190