1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "utypeinfo.h" // for 'typeid' to work 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/utypes.h" 19 #include "unicode/brkiter.h" 20 #include "unicode/rbbi.h" 21 #include "unicode/uchar.h" 22 #include "unicode/utf16.h" 23 #include "unicode/ucnv.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uniset.h" 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 #include "unicode/regex.h" 28 #endif 29 #include "unicode/ustring.h" 30 #include "unicode/utext.h" 31 #include "intltest.h" 32 #include "rbbitst.h" 33 #include <string.h> 34 #include "uvector.h" 35 #include "uvectr32.h" 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include "unicode/numfmt.h" 40 #include "unicode/uscript.h" 41 42 #define TEST_ASSERT(x) {if (!(x)) { \ 43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 44 45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 47 48 49 //--------------------------------------------- 50 // runIndexedTest 51 //--------------------------------------------- 52 53 54 // Note: Before adding new tests to this file, check whether the desired test data can 55 // simply be added to the file testdata/rbbitest.txt. 
In most cases it can, 56 // it's much less work than writing a new test, diagnostic output in the event of failures 57 // is good, and the test data file will is shared with ICU4J, so eventually the test 58 // will run there as well, without additional effort. 59 60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 61 { 62 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 63 64 switch (index) { 65 #if !UCONFIG_NO_FILE_IO 66 case 0: name = "TestBug4153072"; 67 if(exec) TestBug4153072(); break; 68 #else 69 case 0: name = "skip"; 70 break; 71 #endif 72 73 case 1: name = "skip"; 74 break; 75 case 2: name = "TestStatusReturn"; 76 if(exec) TestStatusReturn(); break; 77 78 #if !UCONFIG_NO_FILE_IO 79 case 3: name = "TestUnicodeFiles"; 80 if(exec) TestUnicodeFiles(); break; 81 case 4: name = "TestEmptyString"; 82 if(exec) TestEmptyString(); break; 83 #else 84 case 3: case 4: name = "skip"; 85 break; 86 #endif 87 88 case 5: name = "TestGetAvailableLocales"; 89 if(exec) TestGetAvailableLocales(); break; 90 91 case 6: name = "TestGetDisplayName"; 92 if(exec) TestGetDisplayName(); break; 93 94 #if !UCONFIG_NO_FILE_IO 95 case 7: name = "TestEndBehaviour"; 96 if(exec) TestEndBehaviour(); break; 97 case 8: case 9: case 10: name = "skip"; 98 break; 99 case 11: name = "TestWordBreaks"; 100 if(exec) TestWordBreaks(); break; 101 case 12: name = "TestWordBoundary"; 102 if(exec) TestWordBoundary(); break; 103 case 13: name = "TestLineBreaks"; 104 if(exec) TestLineBreaks(); break; 105 case 14: name = "TestSentBreaks"; 106 if(exec) TestSentBreaks(); break; 107 case 15: name = "TestExtended"; 108 if(exec) TestExtended(); break; 109 #else 110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 111 break; 112 #endif 113 114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 115 case 16: 116 name = "TestMonkey"; if(exec) TestMonkey(params); break; 117 #else 118 case 16: 119 name = "skip"; break; 
120 #endif 121 122 #if !UCONFIG_NO_FILE_IO 123 case 17: name = "TestBug3818"; 124 if(exec) TestBug3818(); break; 125 #else 126 case 17: name = "skip"; 127 break; 128 #endif 129 130 case 18: name = "skip"; 131 break; 132 case 19: name = "TestDebug"; 133 if(exec) TestDebug(); break; 134 case 20: name = "skip"; 135 break; 136 137 #if !UCONFIG_NO_FILE_IO 138 case 21: name = "TestBug5775"; 139 if (exec) TestBug5775(); break; 140 #else 141 case 21: name = "skip"; 142 break; 143 #endif 144 145 case 22: name = "TestBug9983"; 146 if (exec) TestBug9983(); break; 147 case 23: name = "TestDictRules"; 148 if (exec) TestDictRules(); break; 149 case 24: name = "TestBug5532"; 150 if (exec) TestBug5532(); break; 151 default: name = ""; break; //needed to end loop 152 } 153 } 154 155 156 //--------------------------------------------------------------------------- 157 // 158 // class BITestData Holds a set of Break iterator test data and results 159 // Includes 160 // - the string data to be broken 161 // - a vector of the expected break positions. 162 // - a vector of source line numbers for the data, 163 // (to help see where errors occured.) 164 // - The expected break tag values. 165 // - Vectors of actual break positions and tag values. 166 // - Functions for comparing actual with expected and 167 // reporting errors. 168 // 169 //---------------------------------------------------------------------------- 170 class BITestData { 171 public: 172 UnicodeString fDataToBreak; 173 UVector fExpectedBreakPositions; 174 UVector fExpectedTags; 175 UVector fLineNum; 176 UVector fActualBreakPositions; // Test Results. 177 UVector fActualTags; 178 179 BITestData(UErrorCode &status); 180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 181 void checkResults(const char *heading, RBBITest *test); 182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 183 void clearResults(); 184 }; 185 186 // 187 // Constructor. 
188 // 189 BITestData::BITestData(UErrorCode &status) 190 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 191 fActualTags(status) 192 { 193 } 194 195 // 196 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 197 // The macro form collects the line number, which is helpful 198 // when tracking down failures. 199 // 200 // A null data item is inserted at the start of each test's data 201 // to put the starting zero into the data list. The position saved for 202 // each non-null item is its ending position. 203 // 204 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 206 if (U_FAILURE(status)) {return;} 207 if (data != NULL) { 208 fDataToBreak.append(CharsToUnicodeString(data)); 209 } 210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 211 fExpectedTags.addElement(tag, status); 212 fLineNum.addElement(lineNum, status); 213 } 214 215 216 // 217 // checkResults. Compare the actual and expected break positions, report any differences. 218 // 219 void BITestData::checkResults(const char *heading, RBBITest *test) { 220 int32_t expectedIndex = 0; 221 int32_t actualIndex = 0; 222 223 for (;;) { 224 // If we've run through both the expected and actual results vectors, we're done. 225 // break out of the loop. 
226 if (expectedIndex >= fExpectedBreakPositions.size() && 227 actualIndex >= fActualBreakPositions.size()) { 228 break; 229 } 230 231 232 if (expectedIndex >= fExpectedBreakPositions.size()) { 233 err(heading, test, expectedIndex-1, actualIndex); 234 actualIndex++; 235 continue; 236 } 237 238 if (actualIndex >= fActualBreakPositions.size()) { 239 err(heading, test, expectedIndex, actualIndex-1); 240 expectedIndex++; 241 continue; 242 } 243 244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 245 err(heading, test, expectedIndex, actualIndex); 246 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 248 actualIndex++; 249 } else { 250 expectedIndex++; 251 } 252 continue; 253 } 254 255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 257 heading, fLineNum.elementAt(expectedIndex), 258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 259 } 260 261 actualIndex++; 262 expectedIndex++; 263 } 264 } 265 266 // 267 // err - An error was found. Report it, along with information about where the 268 // incorrectly broken test data appeared in the source file. 269 // 270 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 271 { 272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 273 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 274 int32_t o = 0; 275 int32_t line = fLineNum.elementAti(expectedIdx); 276 if (expectedIdx > 0) { 277 // The line numbers are off by one because a premature break occurs somewhere 278 // within the previous item, rather than at the start of the current (expected) item. 
279 // We want to report the offset of the unexpected break from the start of 280 // this previous item. 281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 282 } 283 if (actual < expected) { 284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 285 } else { 286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 287 } 288 } 289 290 291 void BITestData::clearResults() { 292 fActualBreakPositions.removeAllElements(); 293 fActualTags.removeAllElements(); 294 } 295 296 297 //-------------------------------------------------------------------------------------- 298 // 299 // RBBITest constructor and destructor 300 // 301 //-------------------------------------------------------------------------------------- 302 303 RBBITest::RBBITest() { 304 } 305 306 307 RBBITest::~RBBITest() { 308 } 309 310 //----------------------------------------------------------------------------------- 311 // 312 // Test for status {tag} return value from break rules. 313 // TODO: a more thorough test. 
314 // 315 //----------------------------------------------------------------------------------- 316 void RBBITest::TestStatusReturn() { 317 UnicodeString rulesString1("$Letters = [:L:];\n" 318 "$Numbers = [:N:];\n" 319 "$Letters+{1};\n" 320 "$Numbers+{2};\n" 321 "Help\\ {4}/me\\!;\n" 322 "[^$Letters $Numbers];\n" 323 "!.*;\n", -1, US_INV); 324 UnicodeString testString1 = "abc123..abc Help me Help me!"; 325 // 01234567890123456789012345678 326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 328 329 UErrorCode status=U_ZERO_ERROR; 330 UParseError parseError; 331 332 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 333 if(U_FAILURE(status)) { 334 dataerrln("FAIL : in construction - %s", u_errorName(status)); 335 } else { 336 int32_t pos; 337 int32_t i = 0; 338 bi->setText(testString1); 339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 340 if (pos != bounds1[i]) { 341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 342 break; 343 } 344 345 int tag = bi->getRuleStatus(); 346 if (tag != brkStatus[i]) { 347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 348 break; 349 } 350 i++; 351 } 352 } 353 delete bi; 354 } 355 356 357 static void printStringBreaks(UnicodeString ustr, int expected[], 358 int expectedcount) 359 { 360 UErrorCode status = U_ZERO_ERROR; 361 char name[100]; 362 printf("code alpha extend alphanum type word sent line name\n"); 363 int j; 364 for (j = 0; j < ustr.length(); j ++) { 365 if (expectedcount > 0) { 366 int k; 367 for (k = 0; k < expectedcount; k ++) { 368 if (j == expected[k]) { 369 printf("------------------------------------------------ %d\n", 370 j); 371 } 372 } 373 } 374 UChar32 c = ustr.char32At(j); 375 if (c > 0xffff) { 376 j ++; 377 } 378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 379 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 380 u_isUAlphabetic(c), 381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 382 u_isalnum(c), 383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 384 u_charType(c), 385 U_SHORT_PROPERTY_NAME), 386 u_getPropertyValueName(UCHAR_WORD_BREAK, 387 u_getIntPropertyValue(c, 388 UCHAR_WORD_BREAK), 389 U_SHORT_PROPERTY_NAME), 390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 391 u_getIntPropertyValue(c, 392 UCHAR_SENTENCE_BREAK), 393 U_SHORT_PROPERTY_NAME), 394 u_getPropertyValueName(UCHAR_LINE_BREAK, 395 u_getIntPropertyValue(c, 396 UCHAR_LINE_BREAK), 397 U_SHORT_PROPERTY_NAME), 398 name); 399 } 400 } 401 402 403 void RBBITest::TestBug3818() { 404 UErrorCode status = U_ZERO_ERROR; 405 406 // Four Thai words... 407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 409 UnicodeString thaiStr(thaiWordData); 410 411 RuleBasedBreakIterator* bi = 412 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 413 if (U_FAILURE(status) || bi == NULL) { 414 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 415 return; 416 } 417 bi->setText(thaiStr); 418 419 int32_t startOfSecondWord = bi->following(1); 420 if (startOfSecondWord != 4) { 421 errln("Fail at file %s, line %d expected start of word at 4, got %d", 422 __FILE__, __LINE__, startOfSecondWord); 423 } 424 startOfSecondWord = bi->following(0); 425 if (startOfSecondWord != 4) { 426 errln("Fail at file %s, line %d expected start of word at 4, got %d", 427 __FILE__, __LINE__, startOfSecondWord); 428 } 429 delete bi; 430 } 431 432 //---------------------------------------------------------------------------- 433 // 434 // generalIteratorTest Given a break iterator and a set of test data, 435 // Run the tests and report the results. 
436 // 437 //---------------------------------------------------------------------------- 438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 439 { 440 441 bi.setText(td.fDataToBreak); 442 443 testFirstAndNext(bi, td); 444 445 testLastAndPrevious(bi, td); 446 447 testFollowing(bi, td); 448 testPreceding(bi, td); 449 testIsBoundary(bi, td); 450 doMultipleSelectionTest(bi, td); 451 } 452 453 454 // 455 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 456 // kind of loop. 457 // 458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 459 { 460 UErrorCode status = U_ZERO_ERROR; 461 int32_t p; 462 int32_t lastP = -1; 463 int32_t tag; 464 465 logln("Test first and next"); 466 bi.setText(td.fDataToBreak); 467 td.clearResults(); 468 469 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 470 td.fActualBreakPositions.addElement(p, status); // Save result. 471 tag = bi.getRuleStatus(); 472 td.fActualTags.addElement(tag, status); 473 if (p <= lastP) { 474 // If the iterator is not making forward progress, stop. 475 // No need to raise an error here, it'll be detected in the normal check of results. 476 break; 477 } 478 lastP = p; 479 } 480 td.checkResults("testFirstAndNext", this); 481 } 482 483 484 // 485 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 486 // 487 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 488 { 489 UErrorCode status = U_ZERO_ERROR; 490 int32_t p; 491 int32_t lastP = 0x7ffffffe; 492 int32_t tag; 493 494 logln("Test last and previous"); 495 bi.setText(td.fDataToBreak); 496 td.clearResults(); 497 498 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 499 // Save break position. Insert it at start of vector of results, shoving 500 // already-saved results further towards the end. 
501 td.fActualBreakPositions.insertElementAt(p, 0, status); 502 // bi.previous(); // TODO: Why does this fix things up???? 503 // bi.next(); 504 tag = bi.getRuleStatus(); 505 td.fActualTags.insertElementAt(tag, 0, status); 506 if (p >= lastP) { 507 // If the iterator is not making progress, stop. 508 // No need to raise an error here, it'll be detected in the normal check of results. 509 break; 510 } 511 lastP = p; 512 } 513 td.checkResults("testLastAndPrevious", this); 514 } 515 516 517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 518 { 519 UErrorCode status = U_ZERO_ERROR; 520 int32_t p; 521 int32_t tag; 522 int32_t lastP = -2; // A value that will never be returned as a break position. 523 // cannot be -1; that is returned for DONE. 524 int i; 525 526 logln("testFollowing():"); 527 bi.setText(td.fDataToBreak); 528 td.clearResults(); 529 530 // Save the starting point, since we won't get that out of following. 531 p = bi.first(); 532 td.fActualBreakPositions.addElement(p, status); // Save result. 533 tag = bi.getRuleStatus(); 534 td.fActualTags.addElement(tag, status); 535 536 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 537 p = bi.following(i); 538 if (p != lastP) { 539 if (p == RuleBasedBreakIterator::DONE) { 540 break; 541 } 542 // We've reached a new break position. Save it. 543 td.fActualBreakPositions.addElement(p, status); // Save result. 544 tag = bi.getRuleStatus(); 545 td.fActualTags.addElement(tag, status); 546 lastP = p; 547 } 548 } 549 // The loop normally exits by means of the break in the middle. 550 // Make sure that the index was at the correct position for the break iterator to have 551 // returned DONE. 552 if (i != td.fDataToBreak.length()) { 553 errln("testFollowing(): iterator returned DONE prematurely."); 554 } 555 556 // Full check of all results. 
557 td.checkResults("testFollowing", this); 558 } 559 560 561 562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 563 UErrorCode status = U_ZERO_ERROR; 564 int32_t p; 565 int32_t tag; 566 int32_t lastP = 0x7ffffffe; 567 int i; 568 569 logln("testPreceding():"); 570 bi.setText(td.fDataToBreak); 571 td.clearResults(); 572 573 p = bi.last(); 574 td.fActualBreakPositions.addElement(p, status); 575 tag = bi.getRuleStatus(); 576 td.fActualTags.addElement(tag, status); 577 578 for (i = td.fDataToBreak.length(); i>=-1; i--) { 579 p = bi.preceding(i); 580 if (p != lastP) { 581 if (p == RuleBasedBreakIterator::DONE) { 582 break; 583 } 584 // We've reached a new break position. Save it. 585 td.fActualBreakPositions.insertElementAt(p, 0, status); 586 lastP = p; 587 tag = bi.getRuleStatus(); 588 td.fActualTags.insertElementAt(tag, 0, status); 589 } 590 } 591 // The loop normally exits by means of the break in the middle. 592 // Make sure that the index was at the correct position for the break iterator to have 593 // returned DONE. 594 if (i != 0) { 595 errln("testPreceding(): iterator returned DONE prematurely."); 596 } 597 598 // Full check of all results. 599 td.checkResults("testPreceding", this); 600 } 601 602 603 604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 605 UErrorCode status = U_ZERO_ERROR; 606 int i; 607 int32_t tag; 608 609 logln("testIsBoundary():"); 610 bi.setText(td.fDataToBreak); 611 td.clearResults(); 612 613 for (i = 0; i <= td.fDataToBreak.length(); i++) { 614 if (bi.isBoundary(i)) { 615 td.fActualBreakPositions.addElement(i, status); // Save result. 
616 tag = bi.getRuleStatus(); 617 td.fActualTags.addElement(tag, status); 618 } 619 } 620 td.checkResults("testIsBoundary: ", this); 621 } 622 623 624 625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 626 { 627 iterator.setText(td.fDataToBreak); 628 629 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 630 int32_t offset = iterator.first(); 631 int32_t testOffset; 632 int32_t count = 0; 633 634 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 635 636 if (*testIterator != iterator) 637 errln("clone() or operator!= failed: two clones compared unequal"); 638 639 do { 640 testOffset = testIterator->first(); 641 testOffset = testIterator->next(count); 642 if (offset != testOffset) 643 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 644 645 if (offset != RuleBasedBreakIterator::DONE) { 646 count++; 647 offset = iterator.next(); 648 649 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 650 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 651 if (count > 10000 || offset == -1) { 652 errln("operator== failed too many times. Stopping test."); 653 if (offset == -1) { 654 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 655 } 656 return; 657 } 658 } 659 } 660 } while (offset != RuleBasedBreakIterator::DONE); 661 662 // now do it backwards... 
663 offset = iterator.last(); 664 count = 0; 665 666 do { 667 testOffset = testIterator->last(); 668 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 669 if (offset != testOffset) 670 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 671 672 if (offset != RuleBasedBreakIterator::DONE) { 673 count--; 674 offset = iterator.previous(); 675 } 676 } while (offset != RuleBasedBreakIterator::DONE); 677 678 delete testIterator; 679 } 680 681 682 //--------------------------------------------- 683 // 684 // other tests 685 // 686 //--------------------------------------------- 687 void RBBITest::TestEmptyString() 688 { 689 UnicodeString text = ""; 690 UErrorCode status = U_ZERO_ERROR; 691 692 BITestData x(status); 693 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 694 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 695 if (U_FAILURE(status)) 696 { 697 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 698 return; 699 } 700 generalIteratorTest(*bi, x); 701 delete bi; 702 } 703 704 void RBBITest::TestGetAvailableLocales() 705 { 706 int32_t locCount = 0; 707 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 708 709 if (locCount == 0) 710 dataerrln("getAvailableLocales() returned an empty list!"); 711 // Just make sure that it's returning good memory. 
712 int32_t i; 713 for (i = 0; i < locCount; ++i) { 714 logln(locList[i].getName()); 715 } 716 } 717 718 //Testing the BreakIterator::getDisplayName() function 719 void RBBITest::TestGetDisplayName() 720 { 721 UnicodeString result; 722 723 BreakIterator::getDisplayName(Locale::getUS(), result); 724 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 725 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 726 + result); 727 728 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 729 if (result != "French (France)") 730 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 731 + result); 732 } 733 /** 734 * Test End Behaviour 735 * @bug 4068137 736 */ 737 void RBBITest::TestEndBehaviour() 738 { 739 UErrorCode status = U_ZERO_ERROR; 740 UnicodeString testString("boo."); 741 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 742 if (U_FAILURE(status)) 743 { 744 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 745 return; 746 } 747 wb->setText(testString); 748 749 if (wb->first() != 0) 750 errln("Didn't get break at beginning of string."); 751 if (wb->next() != 3) 752 errln("Didn't get break before period in \"boo.\""); 753 if (wb->current() != 4 && wb->next() != 4) 754 errln("Didn't get break at end of string."); 755 delete wb; 756 } 757 /* 758 * @bug 4153072 759 */ 760 void RBBITest::TestBug4153072() { 761 UErrorCode status = U_ZERO_ERROR; 762 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 763 if (U_FAILURE(status)) 764 { 765 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 766 return; 767 } 768 UnicodeString str("...Hello, World!..."); 769 int32_t begin = 3; 770 int32_t end = str.length() - 3; 771 UBool onBoundary; 772 773 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 774 iter->adoptText(textIterator); 775 int index; 776 // Note: with the switch to UText, there is no way to restrict the 777 // iteration range to begin at an index other than zero. 778 // String character iterators created with a non-zero bound are 779 // treated by RBBI as being empty. 780 for (index = -1; index < begin + 1; ++index) { 781 onBoundary = iter->isBoundary(index); 782 if (index == 0? !onBoundary : onBoundary) { 783 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 784 " and begin index = " + begin); 785 } 786 } 787 delete iter; 788 } 789 790 791 // 792 // Test for problem reported by Ashok Matoria on 9 July 2007 793 // One.<kSoftHyphen><kSpace>Two. 794 // 795 // Sentence break at start (0) and then on calling next() it breaks at 796 // 'T' of "Two". Now, at this point if I do next() and 797 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
798 // 799 void RBBITest::TestBug5775() { 800 UErrorCode status = U_ZERO_ERROR; 801 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 802 TEST_ASSERT_SUCCESS(status); 803 if (U_FAILURE(status)) { 804 return; 805 } 806 // Check for status first for better handling of no data errors. 807 TEST_ASSERT(bi != NULL); 808 if (bi == NULL) { 809 return; 810 } 811 812 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 813 // 01234 56789 814 s = s.unescape(); 815 bi->setText(s); 816 int pos = bi->next(); 817 TEST_ASSERT(pos == 6); 818 pos = bi->next(); 819 TEST_ASSERT(pos == 10); 820 pos = bi->previous(); 821 TEST_ASSERT(pos == 6); 822 delete bi; 823 } 824 825 826 827 //------------------------------------------------------------------------------ 828 // 829 // RBBITest::Extended Run RBBI Tests from an external test data file 830 // 831 //------------------------------------------------------------------------------ 832 833 struct TestParams { 834 BreakIterator *bi; 835 UnicodeString dataToBreak; 836 UVector32 *expectedBreaks; 837 UVector32 *srcLine; 838 UVector32 *srcCol; 839 }; 840 841 void RBBITest::executeTest(TestParams *t) { 842 int32_t bp; 843 int32_t prevBP; 844 int32_t i; 845 846 if (t->bi == NULL) { 847 return; 848 } 849 850 t->bi->setText(t->dataToBreak); 851 // 852 // Run the iterator forward 853 // 854 prevBP = -1; 855 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 856 if (prevBP == bp) { 857 // Fail for lack of forward progress. 858 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 859 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 860 break; 861 } 862 863 // Check that there were we didn't miss an expected break between the last one 864 // and this one. 
865 for (i=prevBP+1; i<bp; i++) { 866 if (t->expectedBreaks->elementAti(i) != 0) { 867 int expected[] = {0, i}; 868 printStringBreaks(t->dataToBreak, expected, 2); 869 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 870 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 871 } 872 } 873 874 // Check that the break we did find was expected 875 if (t->expectedBreaks->elementAti(bp) == 0) { 876 int expected[] = {0, bp}; 877 printStringBreaks(t->dataToBreak, expected, 2); 878 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 879 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 880 } else { 881 // The break was expected. 882 // Check that the {nnn} tag value is correct. 883 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 884 if (expectedTagVal == -1) { 885 expectedTagVal = 0; 886 } 887 int32_t line = t->srcLine->elementAti(bp); 888 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 889 if (rs != expectedTagVal) { 890 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 891 " Actual, Expected status = %4d, %4d", 892 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 893 } 894 } 895 896 897 prevBP = bp; 898 } 899 900 // Verify that there were no missed expected breaks after the last one found 901 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 902 if (t->expectedBreaks->elementAti(i) != 0) { 903 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 904 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 905 } 906 } 907 908 // 909 // Run the iterator backwards, verify that the same breaks are found. 910 // 911 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 912 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 913 if (prevBP == bp) { 914 // Fail for lack of progress. 
915 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 916 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 917 break; 918 } 919 920 // Check that there were we didn't miss an expected break between the last one 921 // and this one. (UVector returns zeros for index out of bounds.) 922 for (i=prevBP-1; i>bp; i--) { 923 if (t->expectedBreaks->elementAti(i) != 0) { 924 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 925 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 926 } 927 } 928 929 // Check that the break we did find was expected 930 if (t->expectedBreaks->elementAti(bp) == 0) { 931 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 932 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 933 } else { 934 // The break was expected. 935 // Check that the {nnn} tag value is correct. 936 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 937 if (expectedTagVal == -1) { 938 expectedTagVal = 0; 939 } 940 int line = t->srcLine->elementAti(bp); 941 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 942 if (rs != expectedTagVal) { 943 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 944 " Actual, Expected status = %4d, %4d", 945 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 946 } 947 } 948 949 prevBP = bp; 950 } 951 952 // Verify that there were no missed breaks prior to the last one found 953 for (i=prevBP-1; i>=0; i--) { 954 if (t->expectedBreaks->elementAti(i) != 0) { 955 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 956 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 957 } 958 } 959 960 // Check isBoundary() 961 for (i=0; i<t->expectedBreaks->size(); i++) { 962 UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0); 963 UBool boundaryFound = t->bi->isBoundary(i); 964 if (boundaryExpected != boundaryFound) { 965 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 966 " Expected, Actual= %s, %s", 967 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), 968 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 969 } 970 } 971 972 // Check following() 973 for (i=0; i<t->expectedBreaks->size(); i++) { 974 int32_t actualBreak = t->bi->following(i); 975 int32_t expectedBreak = BreakIterator::DONE; 976 for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) { 977 if (t->expectedBreaks->elementAti(j) != 0) { 978 expectedBreak = j; 979 break; 980 } 981 } 982 if (expectedBreak != actualBreak) { 983 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 984 " Expected, Actual= %d, %d", 985 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 986 } 987 } 988 989 // Check preceding() 990 for (i=t->expectedBreaks->size(); i>=0; i--) { 991 int32_t actualBreak = t->bi->preceding(i); 992 int32_t expectedBreak = BreakIterator::DONE; 993 994 for (int32_t j=i-1; j >= 0; j--) { 995 if (t->expectedBreaks->elementAti(j) != 0) { 996 expectedBreak = j; 997 break; 998 } 999 } 1000 if (expectedBreak != actualBreak) { 1001 errln("preceding(%d) incorrect. 
File line,col= %4d,%4d\n" 1002 " Expected, Actual= %d, %d", 1003 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 1004 } 1005 } 1006 } 1007 1008 1009 void RBBITest::TestExtended() { 1010 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1011 UErrorCode status = U_ZERO_ERROR; 1012 Locale locale(""); 1013 1014 UnicodeString rules; 1015 TestParams tp; 1016 tp.bi = NULL; 1017 tp.expectedBreaks = new UVector32(status); 1018 tp.srcLine = new UVector32(status); 1019 tp.srcCol = new UVector32(status); 1020 1021 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1022 if (U_FAILURE(status)) { 1023 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1024 } 1025 1026 1027 // 1028 // Open and read the test data file. 1029 // 1030 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1031 char testFileName[1000]; 1032 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1033 errln("Can't open test data. 
Path too long."); 1034 return; 1035 } 1036 strcpy(testFileName, testDataDirectory); 1037 strcat(testFileName, "rbbitst.txt"); 1038 1039 int len; 1040 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1041 if (U_FAILURE(status)) { 1042 return; /* something went wrong, error already output */ 1043 } 1044 1045 1046 1047 1048 // 1049 // Put the test data into a UnicodeString 1050 // 1051 UnicodeString testString(FALSE, testFile, len); 1052 1053 enum EParseState{ 1054 PARSE_COMMENT, 1055 PARSE_TAG, 1056 PARSE_DATA, 1057 PARSE_NUM 1058 } 1059 parseState = PARSE_TAG; 1060 1061 EParseState savedState = PARSE_TAG; 1062 1063 static const UChar CH_LF = 0x0a; 1064 static const UChar CH_CR = 0x0d; 1065 static const UChar CH_HASH = 0x23; 1066 /*static const UChar CH_PERIOD = 0x2e;*/ 1067 static const UChar CH_LT = 0x3c; 1068 static const UChar CH_GT = 0x3e; 1069 static const UChar CH_BACKSLASH = 0x5c; 1070 static const UChar CH_BULLET = 0x2022; 1071 1072 int32_t lineNum = 1; 1073 int32_t colStart = 0; 1074 int32_t column = 0; 1075 int32_t charIdx = 0; 1076 1077 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1078 1079 for (charIdx = 0; charIdx < len; ) { 1080 status = U_ZERO_ERROR; 1081 UChar c = testString.charAt(charIdx); 1082 charIdx++; 1083 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1084 // treat CRLF as a unit 1085 c = CH_LF; 1086 charIdx++; 1087 } 1088 if (c == CH_LF || c == CH_CR) { 1089 lineNum++; 1090 colStart = charIdx; 1091 } 1092 column = charIdx - colStart + 1; 1093 1094 switch (parseState) { 1095 case PARSE_COMMENT: 1096 if (c == 0x0a || c == 0x0d) { 1097 parseState = savedState; 1098 } 1099 break; 1100 1101 case PARSE_TAG: 1102 { 1103 if (c == CH_HASH) { 1104 parseState = PARSE_COMMENT; 1105 savedState = PARSE_TAG; 1106 break; 1107 } 1108 if (u_isUWhiteSpace(c)) { 1109 break; 1110 } 1111 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1112 delete tp.bi; 1113 tp.bi = BreakIterator::createWordInstance(locale, status); 1114 charIdx += 5; 1115 break; 1116 } 1117 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1118 delete tp.bi; 1119 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1120 charIdx += 5; 1121 break; 1122 } 1123 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1124 delete tp.bi; 1125 tp.bi = BreakIterator::createLineInstance(locale, status); 1126 charIdx += 5; 1127 break; 1128 } 1129 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1130 delete tp.bi; 1131 tp.bi = NULL; 1132 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1133 charIdx += 5; 1134 break; 1135 } 1136 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1137 delete tp.bi; 1138 tp.bi = BreakIterator::createTitleInstance(locale, status); 1139 charIdx += 6; 1140 break; 1141 } 1142 1143 // <locale loc_name> 1144 localeMatcher.reset(testString); 1145 if (localeMatcher.lookingAt(charIdx-1, status)) { 1146 UnicodeString localeName = localeMatcher.group(1, status); 1147 char localeName8[100]; 1148 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1149 locale = 
Locale::createFromName(localeName8); 1150 charIdx += localeMatcher.group(0, status).length() - 1; 1151 TEST_ASSERT_SUCCESS(status); 1152 break; 1153 } 1154 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1155 parseState = PARSE_DATA; 1156 charIdx += 5; 1157 tp.dataToBreak = ""; 1158 tp.expectedBreaks->removeAllElements(); 1159 tp.srcCol ->removeAllElements(); 1160 tp.srcLine->removeAllElements(); 1161 break; 1162 } 1163 1164 errln("line %d: Tag expected in test file.", lineNum); 1165 parseState = PARSE_COMMENT; 1166 savedState = PARSE_DATA; 1167 goto end_test; // Stop the test. 1168 } 1169 break; 1170 1171 case PARSE_DATA: 1172 if (c == CH_BULLET) { 1173 int32_t breakIdx = tp.dataToBreak.length(); 1174 tp.expectedBreaks->setSize(breakIdx+1); 1175 tp.expectedBreaks->setElementAt(-1, breakIdx); 1176 tp.srcLine->setSize(breakIdx+1); 1177 tp.srcLine->setElementAt(lineNum, breakIdx); 1178 tp.srcCol ->setSize(breakIdx+1); 1179 tp.srcCol ->setElementAt(column, breakIdx); 1180 break; 1181 } 1182 1183 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1184 // Add final entry to mappings from break location to source file position. 1185 // Need one extra because last break position returned is after the 1186 // last char in the data, not at the last char. 1187 tp.srcLine->addElement(lineNum, status); 1188 tp.srcCol ->addElement(column, status); 1189 1190 parseState = PARSE_TAG; 1191 charIdx += 6; 1192 1193 // RUN THE TEST! 1194 executeTest(&tp); 1195 break; 1196 } 1197 1198 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1199 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1200 // Get the code point from the name and insert it into the test data. 1201 // (Damn, no API takes names in Unicode !!! 
1202 // we've got to take it back to char *) 1203 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1204 int32_t nameLength = nameEndIdx - (charIdx+2); 1205 char charNameBuf[200]; 1206 UChar32 theChar = -1; 1207 if (nameEndIdx != -1) { 1208 UErrorCode status = U_ZERO_ERROR; 1209 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1210 charNameBuf[sizeof(charNameBuf)-1] = 0; 1211 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1212 if (U_FAILURE(status)) { 1213 theChar = -1; 1214 } 1215 } 1216 if (theChar == -1) { 1217 errln("Error in named character in test file at line %d, col %d", 1218 lineNum, column); 1219 } else { 1220 // Named code point was recognized. Insert it 1221 // into the test data. 1222 tp.dataToBreak.append(theChar); 1223 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1224 tp.srcLine->addElement(lineNum, status); 1225 tp.srcCol ->addElement(column, status); 1226 } 1227 } 1228 if (nameEndIdx > charIdx) { 1229 charIdx = nameEndIdx+1; 1230 1231 } 1232 break; 1233 } 1234 1235 1236 1237 1238 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1239 charIdx++; 1240 int32_t breakIdx = tp.dataToBreak.length(); 1241 tp.expectedBreaks->setSize(breakIdx+1); 1242 tp.expectedBreaks->setElementAt(-1, breakIdx); 1243 tp.srcLine->setSize(breakIdx+1); 1244 tp.srcLine->setElementAt(lineNum, breakIdx); 1245 tp.srcCol ->setSize(breakIdx+1); 1246 tp.srcCol ->setElementAt(column, breakIdx); 1247 break; 1248 } 1249 1250 if (c == CH_LT) { 1251 tagValue = 0; 1252 parseState = PARSE_NUM; 1253 break; 1254 } 1255 1256 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1257 parseState = PARSE_COMMENT; 1258 savedState = PARSE_DATA; 1259 break; 1260 } 1261 1262 if (c == CH_BACKSLASH) { 1263 // Check for \ at end of line, a line continuation. 
1264 // Advance over (discard) the newline 1265 UChar32 cp = testString.char32At(charIdx); 1266 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1267 // We have a CR LF 1268 // Need an extra increment of the input ptr to move over both of them 1269 charIdx++; 1270 } 1271 if (cp == CH_LF || cp == CH_CR) { 1272 lineNum++; 1273 colStart = charIdx; 1274 charIdx++; 1275 break; 1276 } 1277 1278 // Let unescape handle the back slash. 1279 cp = testString.unescapeAt(charIdx); 1280 if (cp != -1) { 1281 // Escape sequence was recognized. Insert the char 1282 // into the test data. 1283 tp.dataToBreak.append(cp); 1284 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1285 tp.srcLine->addElement(lineNum, status); 1286 tp.srcCol ->addElement(column, status); 1287 } 1288 break; 1289 } 1290 1291 1292 // Not a recognized backslash escape sequence. 1293 // Take the next char as a literal. 1294 // TODO: Should this be an error? 1295 c = testString.charAt(charIdx); 1296 charIdx = testString.moveIndex32(charIdx, 1); 1297 } 1298 1299 // Normal, non-escaped data char. 1300 tp.dataToBreak.append(c); 1301 1302 // Save the mapping from offset in the data to line/column numbers in 1303 // the original input file. Will be used for better error messages only. 1304 // If there's an expected break before this char, the slot in the mapping 1305 // vector will already be set for this char; don't overwrite it. 1306 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1307 tp.srcLine->addElement(lineNum, status); 1308 tp.srcCol ->addElement(column, status); 1309 } 1310 break; 1311 1312 1313 case PARSE_NUM: 1314 // We are parsing an expected numeric tag value, like <1234>, 1315 // within a chunk of data. 1316 if (u_isUWhiteSpace(c)) { 1317 break; 1318 } 1319 1320 if (c == CH_GT) { 1321 // Finished the number. Add the info to the expected break data, 1322 // and switch parse state back to doing plain data. 
1323 parseState = PARSE_DATA; 1324 if (tagValue == 0) { 1325 tagValue = -1; 1326 } 1327 int32_t breakIdx = tp.dataToBreak.length(); 1328 tp.expectedBreaks->setSize(breakIdx+1); 1329 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1330 tp.srcLine->setSize(breakIdx+1); 1331 tp.srcLine->setElementAt(lineNum, breakIdx); 1332 tp.srcCol ->setSize(breakIdx+1); 1333 tp.srcCol ->setElementAt(column, breakIdx); 1334 break; 1335 } 1336 1337 if (u_isdigit(c)) { 1338 tagValue = tagValue*10 + u_charDigitValue(c); 1339 break; 1340 } 1341 1342 errln("Syntax Error in test file at line %d, col %d", 1343 lineNum, column); 1344 parseState = PARSE_COMMENT; 1345 goto end_test; // Stop the test 1346 break; 1347 } 1348 1349 1350 if (U_FAILURE(status)) { 1351 dataerrln("ICU Error %s while parsing test file at line %d.", 1352 u_errorName(status), lineNum); 1353 status = U_ZERO_ERROR; 1354 goto end_test; // Stop the test 1355 } 1356 1357 } 1358 1359 end_test: 1360 delete tp.bi; 1361 delete tp.expectedBreaks; 1362 delete tp.srcLine; 1363 delete tp.srcCol; 1364 delete [] testFile; 1365 #endif 1366 } 1367 1368 1369 //------------------------------------------------------------------------------- 1370 // 1371 // TestDictRules create a break iterator from source rules that includes a 1372 // dictionary range. Regression for bug #7130. Source rules 1373 // do not declare a break iterator type (word, line, sentence, etc. 1374 // but the dictionary code, without a type, would loop. 
1375 // 1376 //------------------------------------------------------------------------------- 1377 void RBBITest::TestDictRules() { 1378 const char *rules = "$dictionary = [a-z]; \n" 1379 "!!forward; \n" 1380 "$dictionary $dictionary; \n" 1381 "!!reverse; \n" 1382 "$dictionary $dictionary; \n"; 1383 const char *text = "aa"; 1384 UErrorCode status = U_ZERO_ERROR; 1385 UParseError parseError; 1386 1387 RuleBasedBreakIterator bi(rules, parseError, status); 1388 if (U_SUCCESS(status)) { 1389 UnicodeString utext = text; 1390 bi.setText(utext); 1391 int32_t position; 1392 int32_t loops; 1393 for (loops = 0; loops<10; loops++) { 1394 position = bi.next(); 1395 if (position == RuleBasedBreakIterator::DONE) { 1396 break; 1397 } 1398 } 1399 TEST_ASSERT(loops == 1); 1400 } else { 1401 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1402 } 1403 } 1404 1405 1406 1407 //------------------------------------------------------------------------------- 1408 // 1409 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1410 // return the datain one big UChar * buffer, which the caller must delete. 1411 // 1412 // parameters: 1413 // fileName: the name of the file, with no directory part. The test data directory 1414 // is assumed. 1415 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1416 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1417 // specified here. The BOM, if it exists, will be stripped from the returned data. 1418 // Pass NULL for the system default encoding. 1419 // status 1420 // returns: 1421 // The file data, converted to UChar. 1422 // The caller must delete this when done with 1423 // delete [] theBuffer; 1424 // 1425 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1426 // Move this function to some common place. 
1427 // 1428 //-------------------------------------------------------------------------------- 1429 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1430 UChar *retPtr = NULL; 1431 char *fileBuf = NULL; 1432 UConverter* conv = NULL; 1433 FILE *f = NULL; 1434 1435 ulen = 0; 1436 if (U_FAILURE(status)) { 1437 return retPtr; 1438 } 1439 1440 // 1441 // Open the file. 1442 // 1443 f = fopen(fileName, "rb"); 1444 if (f == 0) { 1445 dataerrln("Error opening test data file %s\n", fileName); 1446 status = U_FILE_ACCESS_ERROR; 1447 return NULL; 1448 } 1449 // 1450 // Read it in 1451 // 1452 int fileSize; 1453 int amt_read; 1454 1455 fseek( f, 0, SEEK_END); 1456 fileSize = ftell(f); 1457 fileBuf = new char[fileSize]; 1458 fseek(f, 0, SEEK_SET); 1459 amt_read = fread(fileBuf, 1, fileSize, f); 1460 if (amt_read != fileSize || fileSize <= 0) { 1461 errln("Error reading test data file."); 1462 goto cleanUpAndReturn; 1463 } 1464 1465 // 1466 // Look for a Unicode Signature (BOM) on the data just read 1467 // 1468 int32_t signatureLength; 1469 const char * fileBufC; 1470 const char* bomEncoding; 1471 1472 fileBufC = fileBuf; 1473 bomEncoding = ucnv_detectUnicodeSignature( 1474 fileBuf, fileSize, &signatureLength, &status); 1475 if(bomEncoding!=NULL ){ 1476 fileBufC += signatureLength; 1477 fileSize -= signatureLength; 1478 encoding = bomEncoding; 1479 } 1480 1481 // 1482 // Open a converter to take the rule file to UTF-16 1483 // 1484 conv = ucnv_open(encoding, &status); 1485 if (U_FAILURE(status)) { 1486 goto cleanUpAndReturn; 1487 } 1488 1489 // 1490 // Convert the rules to UChar. 1491 // Preflight first to determine required buffer size. 1492 // 1493 ulen = ucnv_toUChars(conv, 1494 NULL, // dest, 1495 0, // destCapacity, 1496 fileBufC, 1497 fileSize, 1498 &status); 1499 if (status == U_BUFFER_OVERFLOW_ERROR) { 1500 // Buffer Overflow is expected from the preflight operation. 
1501 status = U_ZERO_ERROR; 1502 1503 retPtr = new UChar[ulen+1]; 1504 ucnv_toUChars(conv, 1505 retPtr, // dest, 1506 ulen+1, 1507 fileBufC, 1508 fileSize, 1509 &status); 1510 } 1511 1512 cleanUpAndReturn: 1513 fclose(f); 1514 delete []fileBuf; 1515 ucnv_close(conv); 1516 if (U_FAILURE(status)) { 1517 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1518 delete []retPtr; 1519 retPtr = 0; 1520 ulen = 0; 1521 }; 1522 return retPtr; 1523 } 1524 1525 1526 1527 //-------------------------------------------------------------------------------------------- 1528 // 1529 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1530 // 1531 //------------------------------------------------------------------------------------------- 1532 void RBBITest::TestUnicodeFiles() { 1533 RuleBasedBreakIterator *bi; 1534 UErrorCode status = U_ZERO_ERROR; 1535 1536 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1537 TEST_ASSERT_SUCCESS(status); 1538 if (U_SUCCESS(status)) { 1539 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1540 } 1541 delete bi; 1542 1543 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1544 TEST_ASSERT_SUCCESS(status); 1545 if (U_SUCCESS(status)) { 1546 runUnicodeTestData("WordBreakTest.txt", bi); 1547 } 1548 delete bi; 1549 1550 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1551 TEST_ASSERT_SUCCESS(status); 1552 if (U_SUCCESS(status)) { 1553 runUnicodeTestData("SentenceBreakTest.txt", bi); 1554 } 1555 delete bi; 1556 1557 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1558 TEST_ASSERT_SUCCESS(status); 1559 if (U_SUCCESS(status)) { 1560 runUnicodeTestData("LineBreakTest.txt", bi); 1561 } 1562 delete bi; 1563 } 1564 1565 1566 
//-------------------------------------------------------------------------------------------- 1567 // 1568 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1569 // 1570 //------------------------------------------------------------------------------------------- 1571 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1573 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270 1574 UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1); 1575 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 1576 UErrorCode status = U_ZERO_ERROR; 1577 1578 // 1579 // Open and read the test data file, put it into a UnicodeString. 1580 // 1581 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1582 char testFileName[1000]; 1583 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1584 dataerrln("Can't open test data. Path too long."); 1585 return; 1586 } 1587 strcpy(testFileName, testDataDirectory); 1588 strcat(testFileName, fileName); 1589 1590 logln("Opening data file %s\n", fileName); 1591 1592 int len; 1593 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1594 if (status != U_FILE_ACCESS_ERROR) { 1595 TEST_ASSERT_SUCCESS(status); 1596 TEST_ASSERT(testFile != NULL); 1597 } 1598 if (U_FAILURE(status) || testFile == NULL) { 1599 return; /* something went wrong, error already output */ 1600 } 1601 UnicodeString testFileAsString(TRUE, testFile, len); 1602 1603 // 1604 // Parse the test data file using a regular expression. 1605 // Each kind of token is recognized in its own capture group; what type of item was scanned 1606 // is identified by which group had a match. 
1607 // 1608 // Caputure Group # 1 2 3 4 5 1609 // Parses this item: divide x hex digits comment \n unrecognized \n 1610 // 1611 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1612 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1613 UnicodeString testString; 1614 UVector32 breakPositions(status); 1615 int lineNumber = 1; 1616 TEST_ASSERT_SUCCESS(status); 1617 if (U_FAILURE(status)) { 1618 return; 1619 } 1620 1621 // 1622 // Scan through each test case, building up the string to be broken in testString, 1623 // and the positions that should be boundaries in the breakPositions vector. 1624 // 1625 int spin = 0; 1626 while (tokenMatcher.find()) { 1627 if(tokenMatcher.hitEnd()) { 1628 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1629 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1630 and caused an infinite loop here on EBCDIC systems! 1631 */ 1632 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1633 // return; 1634 } 1635 if (tokenMatcher.start(1, status) >= 0) { 1636 // Scanned a divide sign, indicating a break position in the test data. 1637 if (testString.length()>0) { 1638 breakPositions.addElement(testString.length(), status); 1639 } 1640 } 1641 else if (tokenMatcher.start(2, status) >= 0) { 1642 // Scanned an 'x', meaning no break at this position in the test data 1643 // Nothing to be done here. 1644 } 1645 else if (tokenMatcher.start(3, status) >= 0) { 1646 // Scanned Hex digits. Convert them to binary, append to the character data string. 
1647 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1648 int length = hexNumber.length(); 1649 if (length<=8) { 1650 char buf[10]; 1651 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1652 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1653 if (c<=0x10ffff) { 1654 testString.append(c); 1655 } else { 1656 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1657 fileName, lineNumber); 1658 } 1659 } else { 1660 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1661 fileName, lineNumber); 1662 } 1663 } 1664 else if (tokenMatcher.start(4, status) >= 0) { 1665 // Scanned to end of a line, possibly skipping over a comment in the process. 1666 // If the line from the file contained test data, run the test now. 1667 // 1668 if (testString.length() > 0) { 1669 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data. 1670 // Rule 8 1671 // ZW SP* <break> 1672 // is not yet implemented. 1673 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber || 1674 5202 == lineNumber || 1675 5214 == lineNumber || 1676 5246 == lineNumber || 1677 5298 == lineNumber || 1678 5302 == lineNumber ))) { 1679 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1680 } 1681 } 1682 1683 // Clear out this test case. 1684 // The string and breakPositions vector will be refilled as the next 1685 // test case is parsed. 1686 testString.remove(); 1687 breakPositions.removeAllElements(); 1688 lineNumber++; 1689 } else { 1690 // Scanner catchall. Something unrecognized appeared on the line. 1691 char token[16]; 1692 UnicodeString uToken = tokenMatcher.group(0, status); 1693 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1694 token[sizeof(token)-1] = 0; 1695 errln("Syntax error in test data file \'%s\', line %d. 
Scanning \"%s\"\n", fileName, lineNumber, token); 1696 1697 // Clean up, in preparation for continuing with the next line. 1698 testString.remove(); 1699 breakPositions.removeAllElements(); 1700 lineNumber++; 1701 } 1702 TEST_ASSERT_SUCCESS(status); 1703 if (U_FAILURE(status)) { 1704 break; 1705 } 1706 } 1707 1708 delete [] testFile; 1709 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1710 } 1711 1712 //-------------------------------------------------------------------------------------------- 1713 // 1714 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1715 // test data files. Do only a simple, forward-only check - 1716 // this test is mostly to check that ICU and the Unicode 1717 // data agree with each other. 1718 // 1719 //-------------------------------------------------------------------------------------------- 1720 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1721 const UnicodeString &testString, // Text data to be broken 1722 UVector32 *breakPositions, // Positions where breaks should be found. 1723 RuleBasedBreakIterator *bi) { 1724 int32_t pos; // Break Position in the test string 1725 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
1726 int32_t expectedPos; // Expected break position (index into test string) 1727 1728 bi->setText(testString); 1729 pos = bi->first(); 1730 pos = bi->next(); 1731 1732 while (pos != BreakIterator::DONE) { 1733 if (expectedI >= breakPositions->size()) { 1734 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1735 testFileName, lineNumber, pos); 1736 break; 1737 } 1738 expectedPos = breakPositions->elementAti(expectedI); 1739 if (pos < expectedPos) { 1740 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1741 testFileName, lineNumber, pos); 1742 break; 1743 } 1744 if (pos > expectedPos) { 1745 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1746 testFileName, lineNumber, expectedPos); 1747 break; 1748 } 1749 pos = bi->next(); 1750 expectedI++; 1751 } 1752 1753 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1754 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1755 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1756 } 1757 } 1758 1759 1760 1761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1762 //--------------------------------------------------------------------------------------- 1763 // 1764 // classs RBBIMonkeyKind 1765 // 1766 // Monkey Test for Break Iteration 1767 // Abstract interface class. Concrete derived classes independently 1768 // implement the break rules for different iterator types. 1769 // 1770 // The Monkey Test itself uses doesn't know which type of break iterator it is 1771 // testing, but works purely in terms of the interface defined here. 1772 // 1773 //--------------------------------------------------------------------------------------- 1774 class RBBIMonkeyKind { 1775 public: 1776 // Return a UVector of UnicodeSets, representing the character classes used 1777 // for this type of iterator. 
1778 virtual UVector *charClasses() = 0; 1779 1780 // Set the test text on which subsequent calls to next() will operate 1781 virtual void setText(const UnicodeString &s) = 0; 1782 1783 // Find the next break postion, starting from the prev break position, or from zero. 1784 // Return -1 after reaching end of string. 1785 virtual int32_t next(int32_t i) = 0; 1786 1787 virtual ~RBBIMonkeyKind(); 1788 UErrorCode deferredStatus; 1789 1790 1791 protected: 1792 RBBIMonkeyKind(); 1793 1794 private: 1795 }; 1796 1797 RBBIMonkeyKind::RBBIMonkeyKind() { 1798 deferredStatus = U_ZERO_ERROR; 1799 } 1800 1801 RBBIMonkeyKind::~RBBIMonkeyKind() { 1802 } 1803 1804 1805 //---------------------------------------------------------------------------------------- 1806 // 1807 // Random Numbers. Similar to standard lib rand() and srand() 1808 // Not using library to 1809 // 1. Get same results on all platforms. 1810 // 2. Get access to current seed, to more easily reproduce failures. 1811 // 1812 //--------------------------------------------------------------------------------------- 1813 static uint32_t m_seed = 1; 1814 1815 static uint32_t m_rand() 1816 { 1817 m_seed = m_seed * 1103515245 + 12345; 1818 return (uint32_t)(m_seed/65536) % 32768; 1819 } 1820 1821 1822 //------------------------------------------------------------------------------------------ 1823 // 1824 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1825 // of RBBIMonkeyKind. 
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // The character classes returned by charClasses().
    UVector   *fSets;

    // One set per character class consulted by next().
    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;      // union of the L, V, T, LV and LVT sets
    UnicodeSet  *fAnySet;         // all code points, 0..0x10ffff

    // The text being tested; points at the caller's string (see setText()).
    const UnicodeString *fText;
};


// Builds one UnicodeSet per Grapheme_Cluster_Break class and registers the
// character classes in fSets.  The individual Hangul sets (L, V, T, LV, LVT)
// are registered only through their union, fHangulSet, and fPrependSet is
// registered only when it is not empty.  A failure status left by the pattern
// parsing or vector operations is saved in deferredStatus.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;

    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
    fHangulSet  = new UnicodeSet();
    fHangulSet->addAll(*fLSet);
    fHangulSet->addAll(*fVSet);
    fHangulSet->addAll(*fTSet);
    fHangulSet->addAll(*fLVSet);
    fHangulSet->addAll(*fLVTSet);
    fAnySet     = new UnicodeSet(0, 0x10ffff);

    fSets       = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fRegionalIndicatorSet, status);
    if (!fPrependSet->isEmpty()) {
        fSets->addElement(fPrependSet, status);
    }
    fSets->addElement(fSpacingSet, status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text to test.  Only the address is kept, so the caller's string
// must stay alive for as long as next() is being called.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}



// Find the next grapheme cluster boundary after prevPos by applying the rules
// below directly to the text, in order.
// Returns the boundary index, or -1 if deferredStatus holds a failure or
// prevPos is at or beyond the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB9)    x Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}



// Return the character classes registered by the constructor.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


RBBICharMonkey::~RBBICharMonkey() { 2025 delete fSets; 2026 delete fCRLFSet; 2027 delete fControlSet; 2028 delete fExtendSet; 2029 delete fRegionalIndicatorSet; 2030 delete fPrependSet; 2031 delete fSpacingSet; 2032 delete fLSet; 2033 delete fVSet; 2034 delete fTSet; 2035 delete fLVSet; 2036 delete fLVTSet; 2037 delete fHangulSet; 2038 delete fAnySet; 2039 } 2040 2041 //------------------------------------------------------------------------------------------ 2042 // 2043 // class RBBIWordMonkey Word Break specific implementation 2044 // of RBBIMonkeyKind. 2045 // 2046 //------------------------------------------------------------------------------------------ 2047 class RBBIWordMonkey: public RBBIMonkeyKind { 2048 public: 2049 RBBIWordMonkey(); 2050 virtual ~RBBIWordMonkey(); 2051 virtual UVector *charClasses(); 2052 virtual void setText(const UnicodeString &s); 2053 virtual int32_t next(int32_t i); 2054 private: 2055 UVector *fSets; 2056 2057 UnicodeSet *fCRSet; 2058 UnicodeSet *fLFSet; 2059 UnicodeSet *fNewlineSet; 2060 UnicodeSet *fKatakanaSet; 2061 UnicodeSet *fALetterSet; 2062 // TODO(jungshik): Do we still need this change? 
2063 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 2064 UnicodeSet *fMidNumLetSet; 2065 UnicodeSet *fMidLetterSet; 2066 UnicodeSet *fMidNumSet; 2067 UnicodeSet *fNumericSet; 2068 UnicodeSet *fFormatSet; 2069 UnicodeSet *fOtherSet; 2070 UnicodeSet *fExtendSet; 2071 UnicodeSet *fExtendNumLetSet; 2072 UnicodeSet *fRegionalIndicatorSet; 2073 UnicodeSet *fDictionaryCjkSet; 2074 2075 RegexMatcher *fMatcher; 2076 2077 const UnicodeString *fText; 2078 }; 2079 2080 2081 RBBIWordMonkey::RBBIWordMonkey() 2082 { 2083 UErrorCode status = U_ZERO_ERROR; 2084 2085 fSets = new UVector(status); 2086 2087 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2088 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2089 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2090 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); 2091 // Exclude Hangul syllables from ALetterSet during testing. 2092 // Leave CJK dictionary characters out from the monkey tests! 
2093 #if 0 2094 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2095 "[\\p{Line_Break = Complex_Context}" 2096 "-\\p{Grapheme_Cluster_Break = Extend}" 2097 "-\\p{Grapheme_Cluster_Break = Control}" 2098 "]]", 2099 status); 2100 #endif 2101 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2102 fALetterSet->removeAll(*fDictionaryCjkSet); 2103 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2104 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2105 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2106 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2107 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 2108 // we should figure out why 2109 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2110 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2111 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2112 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2113 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2114 2115 fOtherSet = new UnicodeSet(); 2116 if(U_FAILURE(status)) { 2117 deferredStatus = status; 2118 return; 2119 } 2120 2121 fOtherSet->complement(); 2122 fOtherSet->removeAll(*fCRSet); 2123 fOtherSet->removeAll(*fLFSet); 2124 fOtherSet->removeAll(*fNewlineSet); 2125 fOtherSet->removeAll(*fKatakanaSet); 2126 fOtherSet->removeAll(*fALetterSet); 2127 fOtherSet->removeAll(*fMidLetterSet); 2128 fOtherSet->removeAll(*fMidNumSet); 2129 fOtherSet->removeAll(*fNumericSet); 2130 fOtherSet->removeAll(*fExtendNumLetSet); 2131 fOtherSet->removeAll(*fFormatSet); 2132 
fOtherSet->removeAll(*fExtendSet); 2133 fOtherSet->removeAll(*fRegionalIndicatorSet); 2134 // Inhibit dictionary characters from being tested at all. 2135 fOtherSet->removeAll(*fDictionaryCjkSet); 2136 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2137 2138 fSets->addElement(fCRSet, status); 2139 fSets->addElement(fLFSet, status); 2140 fSets->addElement(fNewlineSet, status); 2141 fSets->addElement(fALetterSet, status); 2142 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 2143 fSets->addElement(fMidLetterSet, status); 2144 fSets->addElement(fMidNumLetSet, status); 2145 fSets->addElement(fMidNumSet, status); 2146 fSets->addElement(fNumericSet, status); 2147 fSets->addElement(fFormatSet, status); 2148 fSets->addElement(fExtendSet, status); 2149 fSets->addElement(fOtherSet, status); 2150 fSets->addElement(fExtendNumLetSet, status); 2151 fSets->addElement(fRegionalIndicatorSet, status); 2152 2153 if (U_FAILURE(status)) { 2154 deferredStatus = status; 2155 } 2156 } 2157 2158 void RBBIWordMonkey::setText(const UnicodeString &s) { 2159 fText = &s; 2160 } 2161 2162 2163 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2164 int p0, p1, p2, p3; // Indices of the significant code points around the 2165 // break position being tested. The candidate break 2166 // location is before p2. 2167 2168 int breakPos = -1; 2169 2170 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2171 2172 if (U_FAILURE(deferredStatus)) { 2173 return -1; 2174 } 2175 2176 // Prev break at end of string. return DONE. 2177 if (prevPos >= fText->length()) { 2178 return -1; 2179 } 2180 p0 = p1 = p2 = p3 = prevPos; 2181 c3 = fText->char32At(prevPos); 2182 c0 = c1 = c2 = 0; 2183 2184 // Loop runs once per "significant" character position in the input text. 2185 for (;;) { 2186 // Move all of the positions forward in the input string. 
2187 p0 = p1; c0 = c1; 2188 p1 = p2; c1 = c2; 2189 p2 = p3; c2 = c3; 2190 2191 // Advancd p3 by X(Extend | Format)* Rule 4 2192 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2193 do { 2194 p3 = fText->moveIndex32(p3, 1); 2195 c3 = fText->char32At(p3); 2196 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2197 break; 2198 }; 2199 } 2200 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2201 2202 2203 if (p1 == p2) { 2204 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2205 continue; 2206 } 2207 if (p2 == fText->length()) { 2208 // Reached end of string. Always a break position. 2209 break; 2210 } 2211 2212 // Rule (3) CR x LF 2213 // No Extend or Format characters may appear between the CR and LF, 2214 // which requires the additional check for p2 immediately following p1. 2215 // 2216 if (c1==0x0D && c2==0x0A) { 2217 continue; 2218 } 2219 2220 // Rule (3a) Break before and after newlines (including CR and LF) 2221 // 2222 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2223 break; 2224 }; 2225 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2226 break; 2227 }; 2228 2229 // Rule (5). 
ALetter x ALetter 2230 if (fALetterSet->contains(c1) && 2231 fALetterSet->contains(c2)) { 2232 continue; 2233 } 2234 2235 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 2236 // 2237 if ( fALetterSet->contains(c1) && 2238 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2239 fALetterSet->contains(c3)) { 2240 continue; 2241 } 2242 2243 2244 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 2245 if (fALetterSet->contains(c0) && 2246 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2247 fALetterSet->contains(c2)) { 2248 continue; 2249 } 2250 2251 // Rule (8) Numeric x Numeric 2252 if (fNumericSet->contains(c1) && 2253 fNumericSet->contains(c2)) { 2254 continue; 2255 } 2256 2257 // Rule (9) ALetter x Numeric 2258 if (fALetterSet->contains(c1) && 2259 fNumericSet->contains(c2)) { 2260 continue; 2261 } 2262 2263 // Rule (10) Numeric x ALetter 2264 if (fNumericSet->contains(c1) && 2265 fALetterSet->contains(c2)) { 2266 continue; 2267 } 2268 2269 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 2270 if (fNumericSet->contains(c0) && 2271 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2272 fNumericSet->contains(c2)) { 2273 continue; 2274 } 2275 2276 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 2277 if (fNumericSet->contains(c1) && 2278 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2279 fNumericSet->contains(c3)) { 2280 continue; 2281 } 2282 2283 // Rule (13) Katakana x Katakana 2284 if (fKatakanaSet->contains(c1) && 2285 fKatakanaSet->contains(c2)) { 2286 continue; 2287 } 2288 2289 // Rule 13a 2290 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 2291 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2292 fExtendNumLetSet->contains(c2)) { 2293 continue; 2294 } 2295 2296 // Rule 13b 2297 if (fExtendNumLetSet->contains(c1) && 2298 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 2299 fKatakanaSet->contains(c2))) { 2300 continue; 2301 } 2302 2303 // Rule 13c 
2304 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2305 continue; 2306 } 2307 2308 // Rule 14. Break found here. 2309 break; 2310 } 2311 2312 breakPos = p2; 2313 return breakPos; 2314 } 2315 2316 2317 UVector *RBBIWordMonkey::charClasses() { 2318 return fSets; 2319 } 2320 2321 2322 RBBIWordMonkey::~RBBIWordMonkey() { 2323 delete fSets; 2324 delete fCRSet; 2325 delete fLFSet; 2326 delete fNewlineSet; 2327 delete fKatakanaSet; 2328 delete fALetterSet; 2329 delete fMidNumLetSet; 2330 delete fMidLetterSet; 2331 delete fMidNumSet; 2332 delete fNumericSet; 2333 delete fFormatSet; 2334 delete fExtendSet; 2335 delete fExtendNumLetSet; 2336 delete fRegionalIndicatorSet; 2337 delete fDictionaryCjkSet; 2338 delete fOtherSet; 2339 } 2340 2341 2342 2343 2344 //------------------------------------------------------------------------------------------ 2345 // 2346 // class RBBISentMonkey Sentence Break specific implementation 2347 // of RBBIMonkeyKind. 2348 // 2349 //------------------------------------------------------------------------------------------ 2350 class RBBISentMonkey: public RBBIMonkeyKind { 2351 public: 2352 RBBISentMonkey(); 2353 virtual ~RBBISentMonkey(); 2354 virtual UVector *charClasses(); 2355 virtual void setText(const UnicodeString &s); 2356 virtual int32_t next(int32_t i); 2357 private: 2358 int moveBack(int posFrom); 2359 int moveForward(int posFrom); 2360 UChar32 cAt(int pos); 2361 2362 UVector *fSets; 2363 2364 UnicodeSet *fSepSet; 2365 UnicodeSet *fFormatSet; 2366 UnicodeSet *fSpSet; 2367 UnicodeSet *fLowerSet; 2368 UnicodeSet *fUpperSet; 2369 UnicodeSet *fOLetterSet; 2370 UnicodeSet *fNumericSet; 2371 UnicodeSet *fATermSet; 2372 UnicodeSet *fSContinueSet; 2373 UnicodeSet *fSTermSet; 2374 UnicodeSet *fCloseSet; 2375 UnicodeSet *fOtherSet; 2376 UnicodeSet *fExtendSet; 2377 2378 const UnicodeString *fText; 2379 2380 }; 2381 2382 RBBISentMonkey::RBBISentMonkey() 2383 { 2384 UErrorCode status = U_ZERO_ERROR; 2385 2386 
fSets = new UVector(status); 2387 2388 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2389 // set and made into character classes of their own. For the monkey impl, 2390 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2391 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2392 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2393 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2394 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2395 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2396 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2397 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2398 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2399 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2400 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2401 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2402 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2403 fOtherSet = new UnicodeSet(); 2404 2405 if(U_FAILURE(status)) { 2406 deferredStatus = status; 2407 return; 2408 } 2409 2410 fOtherSet->complement(); 2411 fOtherSet->removeAll(*fSepSet); 2412 fOtherSet->removeAll(*fFormatSet); 2413 fOtherSet->removeAll(*fSpSet); 2414 fOtherSet->removeAll(*fLowerSet); 2415 fOtherSet->removeAll(*fUpperSet); 2416 fOtherSet->removeAll(*fOLetterSet); 2417 fOtherSet->removeAll(*fNumericSet); 2418 fOtherSet->removeAll(*fATermSet); 2419 fOtherSet->removeAll(*fSContinueSet); 2420 
fOtherSet->removeAll(*fSTermSet); 2421 fOtherSet->removeAll(*fCloseSet); 2422 fOtherSet->removeAll(*fExtendSet); 2423 2424 fSets->addElement(fSepSet, status); 2425 fSets->addElement(fFormatSet, status); 2426 fSets->addElement(fSpSet, status); 2427 fSets->addElement(fLowerSet, status); 2428 fSets->addElement(fUpperSet, status); 2429 fSets->addElement(fOLetterSet, status); 2430 fSets->addElement(fNumericSet, status); 2431 fSets->addElement(fATermSet, status); 2432 fSets->addElement(fSContinueSet, status); 2433 fSets->addElement(fSTermSet, status); 2434 fSets->addElement(fCloseSet, status); 2435 fSets->addElement(fOtherSet, status); 2436 fSets->addElement(fExtendSet, status); 2437 2438 if (U_FAILURE(status)) { 2439 deferredStatus = status; 2440 } 2441 } 2442 2443 2444 2445 void RBBISentMonkey::setText(const UnicodeString &s) { 2446 fText = &s; 2447 } 2448 2449 UVector *RBBISentMonkey::charClasses() { 2450 return fSets; 2451 } 2452 2453 2454 // moveBack() Find the "significant" code point preceding the index i. 2455 // Skips over ($Extend | $Format)* . 
2456 // 2457 int RBBISentMonkey::moveBack(int i) { 2458 if (i <= 0) { 2459 return -1; 2460 } 2461 UChar32 c; 2462 int32_t j = i; 2463 do { 2464 j = fText->moveIndex32(j, -1); 2465 c = fText->char32At(j); 2466 } 2467 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2468 return j; 2469 2470 } 2471 2472 2473 int RBBISentMonkey::moveForward(int i) { 2474 if (i>=fText->length()) { 2475 return fText->length(); 2476 } 2477 UChar32 c; 2478 int32_t j = i; 2479 do { 2480 j = fText->moveIndex32(j, 1); 2481 c = cAt(j); 2482 } 2483 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2484 return j; 2485 } 2486 2487 UChar32 RBBISentMonkey::cAt(int pos) { 2488 if (pos<0 || pos>=fText->length()) { 2489 return -1; 2490 } else { 2491 return fText->char32At(pos); 2492 } 2493 } 2494 2495 int32_t RBBISentMonkey::next(int32_t prevPos) { 2496 int p0, p1, p2, p3; // Indices of the significant code points around the 2497 // break position being tested. The candidate break 2498 // location is before p2. 2499 2500 int breakPos = -1; 2501 2502 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2503 UChar32 c; 2504 2505 if (U_FAILURE(deferredStatus)) { 2506 return -1; 2507 } 2508 2509 // Prev break at end of string. return DONE. 2510 if (prevPos >= fText->length()) { 2511 return -1; 2512 } 2513 p0 = p1 = p2 = p3 = prevPos; 2514 c3 = fText->char32At(prevPos); 2515 c0 = c1 = c2 = 0; 2516 2517 // Loop runs once per "significant" character position in the input text. 2518 for (;;) { 2519 // Move all of the positions forward in the input string. 2520 p0 = p1; c0 = c1; 2521 p1 = p2; c1 = c2; 2522 p2 = p3; c2 = c3; 2523 2524 // Advancd p3 by X(Extend | Format)* Rule 4 2525 p3 = moveForward(p3); 2526 c3 = cAt(p3); 2527 2528 // Rule (3) CR x LF 2529 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2530 continue; 2531 } 2532 2533 // Rule (4). Sep <break> 2534 if (fSepSet->contains(c1)) { 2535 p2 = p1+1; // Separators don't combine with Extend or Format. 
2536 break; 2537 } 2538 2539 if (p2 >= fText->length()) { 2540 // Reached end of string. Always a break position. 2541 break; 2542 } 2543 2544 if (p2 == prevPos) { 2545 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2546 continue; 2547 } 2548 2549 // Rule (6). ATerm x Numeric 2550 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2551 continue; 2552 } 2553 2554 // Rule (7). Upper ATerm x Uppper 2555 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2556 continue; 2557 } 2558 2559 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2560 // Note: STerm | ATerm are added to the negated part of the expression by a 2561 // note to the Unicode 5.0 documents. 2562 int p8 = p1; 2563 while (fSpSet->contains(cAt(p8))) { 2564 p8 = moveBack(p8); 2565 } 2566 while (fCloseSet->contains(cAt(p8))) { 2567 p8 = moveBack(p8); 2568 } 2569 if (fATermSet->contains(cAt(p8))) { 2570 p8=p2; 2571 for (;;) { 2572 c = cAt(p8); 2573 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2574 fLowerSet->contains(c) || fSepSet->contains(c) || 2575 fATermSet->contains(c) || fSTermSet->contains(c)) { 2576 break; 2577 } 2578 p8 = moveForward(p8); 2579 } 2580 if (fLowerSet->contains(cAt(p8))) { 2581 continue; 2582 } 2583 } 2584 2585 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2586 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2587 p8 = p1; 2588 while (fSpSet->contains(cAt(p8))) { 2589 p8 = moveBack(p8); 2590 } 2591 while (fCloseSet->contains(cAt(p8))) { 2592 p8 = moveBack(p8); 2593 } 2594 c = cAt(p8); 2595 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2596 continue; 2597 } 2598 } 2599 2600 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2601 int p9 = p1; 2602 while (fCloseSet->contains(cAt(p9))) { 2603 p9 = moveBack(p9); 2604 } 2605 c = cAt(p9); 2606 if 
((fSTermSet->contains(c) || fATermSet->contains(c))) { 2607 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2608 continue; 2609 } 2610 } 2611 2612 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2613 int p10 = p1; 2614 while (fSpSet->contains(cAt(p10))) { 2615 p10 = moveBack(p10); 2616 } 2617 while (fCloseSet->contains(cAt(p10))) { 2618 p10 = moveBack(p10); 2619 } 2620 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2621 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2622 continue; 2623 } 2624 } 2625 2626 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break> 2627 int p11 = p1; 2628 if (fSepSet->contains(cAt(p11))) { 2629 p11 = moveBack(p11); 2630 } 2631 while (fSpSet->contains(cAt(p11))) { 2632 p11 = moveBack(p11); 2633 } 2634 while (fCloseSet->contains(cAt(p11))) { 2635 p11 = moveBack(p11); 2636 } 2637 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2638 break; 2639 } 2640 2641 // Rule (12) Any x Any 2642 continue; 2643 } 2644 breakPos = p2; 2645 return breakPos; 2646 } 2647 2648 RBBISentMonkey::~RBBISentMonkey() { 2649 delete fSets; 2650 delete fSepSet; 2651 delete fFormatSet; 2652 delete fSpSet; 2653 delete fLowerSet; 2654 delete fUpperSet; 2655 delete fOLetterSet; 2656 delete fNumericSet; 2657 delete fATermSet; 2658 delete fSContinueSet; 2659 delete fSTermSet; 2660 delete fCloseSet; 2661 delete fOtherSet; 2662 delete fExtendSet; 2663 } 2664 2665 2666 2667 //------------------------------------------------------------------------------------------- 2668 // 2669 // RBBILineMonkey 2670 // 2671 //------------------------------------------------------------------------------------------- 2672 2673 class RBBILineMonkey: public RBBIMonkeyKind { 2674 public: 2675 RBBILineMonkey(); 2676 virtual ~RBBILineMonkey(); 2677 virtual UVector *charClasses(); 2678 virtual void setText(const UnicodeString &s); 2679 virtual int32_t next(int32_t i); 2680 virtual void 
rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2681 private: 2682 UVector *fSets; 2683 2684 UnicodeSet *fBK; 2685 UnicodeSet *fCR; 2686 UnicodeSet *fLF; 2687 UnicodeSet *fCM; 2688 UnicodeSet *fNL; 2689 UnicodeSet *fSG; 2690 UnicodeSet *fWJ; 2691 UnicodeSet *fZW; 2692 UnicodeSet *fGL; 2693 UnicodeSet *fCB; 2694 UnicodeSet *fSP; 2695 UnicodeSet *fB2; 2696 UnicodeSet *fBA; 2697 UnicodeSet *fBB; 2698 UnicodeSet *fHY; 2699 UnicodeSet *fH2; 2700 UnicodeSet *fH3; 2701 UnicodeSet *fCL; 2702 UnicodeSet *fCP; 2703 UnicodeSet *fEX; 2704 UnicodeSet *fIN; 2705 UnicodeSet *fJL; 2706 UnicodeSet *fJV; 2707 UnicodeSet *fJT; 2708 UnicodeSet *fNS; 2709 UnicodeSet *fOP; 2710 UnicodeSet *fQU; 2711 UnicodeSet *fIS; 2712 UnicodeSet *fNU; 2713 UnicodeSet *fPO; 2714 UnicodeSet *fPR; 2715 UnicodeSet *fSY; 2716 UnicodeSet *fAI; 2717 UnicodeSet *fAL; 2718 UnicodeSet *fCJ; 2719 UnicodeSet *fHL; 2720 UnicodeSet *fID; 2721 UnicodeSet *fRI; 2722 UnicodeSet *fSA; 2723 UnicodeSet *fXX; 2724 2725 BreakIterator *fCharBI; 2726 2727 const UnicodeString *fText; 2728 int32_t *fOrigPositions; 2729 2730 RegexMatcher *fNumberMatcher; 2731 RegexMatcher *fLB11Matcher; 2732 }; 2733 2734 2735 RBBILineMonkey::RBBILineMonkey() 2736 { 2737 UErrorCode status = U_ZERO_ERROR; 2738 2739 fSets = new UVector(status); 2740 2741 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2742 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2743 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2744 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2745 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2746 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2747 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2748 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2749 fCB = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2750 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2751 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2752 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2753 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2754 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2755 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2756 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2757 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2758 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2759 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2760 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2761 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2762 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2763 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2764 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2765 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2766 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2767 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2768 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2769 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2770 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2771 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2772 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2773 fAL = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2774 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2775 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2776 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2777 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2778 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2779 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2780 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2781 2782 if (U_FAILURE(status)) { 2783 deferredStatus = status; 2784 fCharBI = NULL; 2785 fNumberMatcher = NULL; 2786 return; 2787 } 2788 2789 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2790 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2791 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2792 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2793 2794 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 
2795 2796 fSets->addElement(fBK, status); 2797 fSets->addElement(fCR, status); 2798 fSets->addElement(fLF, status); 2799 fSets->addElement(fCM, status); 2800 fSets->addElement(fNL, status); 2801 fSets->addElement(fWJ, status); 2802 fSets->addElement(fZW, status); 2803 fSets->addElement(fGL, status); 2804 fSets->addElement(fCB, status); 2805 fSets->addElement(fSP, status); 2806 fSets->addElement(fB2, status); 2807 fSets->addElement(fBA, status); 2808 fSets->addElement(fBB, status); 2809 fSets->addElement(fHY, status); 2810 fSets->addElement(fH2, status); 2811 fSets->addElement(fH3, status); 2812 fSets->addElement(fCL, status); 2813 fSets->addElement(fCP, status); 2814 fSets->addElement(fEX, status); 2815 fSets->addElement(fIN, status); 2816 fSets->addElement(fJL, status); 2817 fSets->addElement(fJT, status); 2818 fSets->addElement(fJV, status); 2819 fSets->addElement(fNS, status); 2820 fSets->addElement(fOP, status); 2821 fSets->addElement(fQU, status); 2822 fSets->addElement(fIS, status); 2823 fSets->addElement(fNU, status); 2824 fSets->addElement(fPO, status); 2825 fSets->addElement(fPR, status); 2826 fSets->addElement(fSY, status); 2827 fSets->addElement(fAI, status); 2828 fSets->addElement(fAL, status); 2829 fSets->addElement(fHL, status); 2830 fSets->addElement(fID, status); 2831 fSets->addElement(fWJ, status); 2832 fSets->addElement(fRI, status); 2833 fSets->addElement(fSA, status); 2834 fSets->addElement(fSG, status); 2835 2836 const char *rules = 2837 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 2838 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 2839 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 2840 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 2841 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
2842 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 2843 2844 fNumberMatcher = new RegexMatcher( 2845 UnicodeString(rules, -1, US_INV), 0, status); 2846 2847 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2848 2849 if (U_FAILURE(status)) { 2850 deferredStatus = status; 2851 } 2852 } 2853 2854 2855 void RBBILineMonkey::setText(const UnicodeString &s) { 2856 fText = &s; 2857 fCharBI->setText(s); 2858 fNumberMatcher->reset(s); 2859 } 2860 2861 // 2862 // rule9Adjust 2863 // Line Break TR rules 9 and 10 implementation. 2864 // This deals with combining marks and other sequences that 2865 // that must be treated as if they were something other than what they actually are. 2866 // 2867 // This is factored out into a separate function because it must be applied twice for 2868 // each potential break, once to the chars before the position being checked, then 2869 // again to the text following the possible break. 2870 // 2871 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 2872 if (pos == -1) { 2873 // Invalid initial position. Happens during the warmup iteration of the 2874 // main loop in next(). 2875 return; 2876 } 2877 2878 int32_t nPos = *nextPos; 2879 2880 // LB 9 Keep combining sequences together. 2881 // advance over any CM class chars. Note that Line Break CM is different 2882 // from the normal Grapheme Extend property. 2883 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 2884 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 2885 for (;;) { 2886 *nextChar = fText->char32At(nPos); 2887 if (!fCM->contains(*nextChar)) { 2888 break; 2889 } 2890 nPos = fText->moveIndex32(nPos, 1); 2891 } 2892 } 2893 2894 2895 // LB 9 Treat X CM* as if it were x. 2896 // No explicit action required. 
//
//  RBBILineMonkey::next
//
//      Reference ("monkey") implementation of line breaking: walks the text held in
//      fText starting at startPos and applies the line break rules one by one, in order,
//      to each candidate position until a rule says "break".
//
//      Within the loop, "continue" means "no break here, advance to the next position"
//      and "break" means "a boundary was found at pos".  The order of the rule checks
//      is significant - an earlier rule takes precedence over every later one.
//
//      Parameter:
//          startPos    Index in fText from which to start looking for the next break.
//
//      Returns:
//          -1 if deferredStatus already holds a failure, or if startPos is at or
//             beyond the end of the text.
//          Otherwise the value of pos when the loop exits, i.e. the index of the
//          character that follows the break.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    // A failure recorded earlier on this object makes the rule sets unusable.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Nothing left to scan.
    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the sliding window of context one position forward:
        //   prevX2 <- prev <- this <- next
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             The first call may move pos / change thisChar; nextPos is then recomputed
        //             from the adjusted pos before the second call adjusts the following position.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //        NOTE(review): the number regular expression is applied under the
        //        "LB 25" label further down; the "LB 17" reference above looks stale - confirm.
        //
        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
                                         fEX->contains(thisChar)  ||
            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        //       The "tPos > 0" guards stop the backward scans at the start of the text.
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this declaration shadows the function-scope tPos, and - unlike LB 14 -
            //       the space scan is not conditioned on prevChar being a space.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // Note: this declaration also shadows the function-scope tPos.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            //  Uses the function-scope tPos, with the same prevChar-is-space check as LB 14.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        //    x  BA
        //    x  HY
        //    x  NS
        //    BB x
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 21a
        //   HL (HY | BA) x
        //   Uses the second-previous character saved at the top of the loop.
        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            continue;
        }

        // LB 22
        //    (AL | HL | ID | IN | NU) x IN
        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          HL x NU
        //          NU x AL
        //          NU x HL
        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //        PR x ID
        //        PR x (AL | HL)
        //        PO x (AL | HL)
        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
            continue;
        }



        // LB 25    Numbers
        //          The number pattern is matched with fNumberMatcher, anchored at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                // A matcher failure ends the scan; the current pos is returned as a break.
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back pos up to the last non-combining-mark character of the number.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        //       JL x (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        //       (JV | H2) x (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        //       (JT | H3) x JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //       (JL | JV | JT | H2 | H3) x IN
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        //       (JL | JV | JT | H2 | H3) x PO
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        //       PR x (JL | JV | JT | H2 | H3)
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28  Do not break between alphabetics ("at").
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //          (AL | HL | NU) x OP
        //          CP x (AL | HL | NU)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB30a  Do not break between regional indicators.
        //        RI x RI
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    return pos;
}
3372 char valString[100]; 3373 int32_t paramLength = m.end(1, status) - m.start(1, status); 3374 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3375 paramLength = (int32_t)(sizeof(valString)-2); 3376 } 3377 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3378 val = strtol(valString, NULL, 10); 3379 3380 // Delete this parameter from the params string. 3381 m.reset(); 3382 params = m.replaceFirst("", status); 3383 } 3384 U_ASSERT(U_SUCCESS(status)); 3385 return val; 3386 } 3387 #endif 3388 3389 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3390 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3391 BreakIterator *bi, 3392 int expected[], 3393 int expectedcount) 3394 { 3395 int count = 0; 3396 int i = 0; 3397 int forward[50]; 3398 bi->setText(ustr); 3399 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3400 forward[count] = i; 3401 if (count < expectedcount && expected[count] != i) { 3402 test->errln("break forward test failed: expected %d but got %d", 3403 expected[count], i); 3404 break; 3405 } 3406 count ++; 3407 } 3408 if (count != expectedcount) { 3409 printStringBreaks(ustr, expected, expectedcount); 3410 test->errln("break forward test failed: missed %d match", 3411 expectedcount - count); 3412 return; 3413 } 3414 // testing boundaries 3415 for (i = 1; i < expectedcount; i ++) { 3416 int j = expected[i - 1]; 3417 if (!bi->isBoundary(j)) { 3418 printStringBreaks(ustr, expected, expectedcount); 3419 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3420 return; 3421 } 3422 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3423 if (bi->isBoundary(j)) { 3424 printStringBreaks(ustr, expected, expectedcount); 3425 test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); 3426 return; 3427 } 3428 } 3429 } 3430 3431 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3432 count --; 3433 if (forward[count] != i) { 3434 printStringBreaks(ustr, expected, expectedcount); 3435 test->errln("happy break test previous() failed: expected %d but got %d", 3436 forward[count], i); 3437 break; 3438 } 3439 } 3440 if (count != 0) { 3441 printStringBreaks(ustr, expected, expectedcount); 3442 test->errln("break test previous() failed: missed a match"); 3443 return; 3444 } 3445 3446 // testing preceding 3447 for (i = 0; i < expectedcount - 1; i ++) { 3448 // int j = expected[i] + 1; 3449 int j = ustr.moveIndex32(expected[i], 1); 3450 for (; j <= expected[i + 1]; j ++) { 3451 if (bi->preceding(j) != expected[i]) { 3452 printStringBreaks(ustr, expected, expectedcount); 3453 test->errln("preceding(): Not expecting boundary at position %d", j); 3454 return; 3455 } 3456 } 3457 } 3458 } 3459 #endif 3460 3461 void RBBITest::TestWordBreaks(void) 3462 { 3463 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3464 3465 Locale locale("en"); 3466 UErrorCode status = U_ZERO_ERROR; 3467 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3468 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3469 // Replaced any C+J characters in a row with a random sequence of characters 3470 // of the same length to make our C+J segmentation not get in the way. 
3471 static const char *strlist[] = 3472 { 3473 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3474 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3475 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3476 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3477 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3478 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3479 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3480 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3481 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3482 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3483 "\\u2027\\U000e0067\\u0a47\\u00b7", 3484 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3485 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3486 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3487 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3488 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3489 "\\u0027\\u11af\\U000e0057\\u0602", 3490 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3491 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3492 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3493 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3494 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3495 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3496 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3497 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3498 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3499 "\\u18f4\\U000e0049\\u20e7\\u2027", 3500 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3501 "\\ua183\\u102d\\u0bec\\u003a", 3502 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3503 "\\u003a\\u0e57\\u0fad\\u002e", 3504 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3505 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3506 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3507 
"\\u003a\\u0664\\u00b7\\u1fba", 3508 "\\u003b\\u0027\\u00b7\\u47a3", 3509 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3510 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3511 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3512 }; 3513 int loop; 3514 if (U_FAILURE(status)) { 3515 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3516 return; 3517 } 3518 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3519 // printf("looping %d\n", loop); 3520 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3521 // RBBICharMonkey monkey; 3522 RBBIWordMonkey monkey; 3523 3524 int expected[50]; 3525 int expectedcount = 0; 3526 3527 monkey.setText(ustr); 3528 int i; 3529 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3530 expected[expectedcount ++] = i; 3531 } 3532 3533 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3534 } 3535 delete bi; 3536 #endif 3537 } 3538 3539 void RBBITest::TestWordBoundary(void) 3540 { 3541 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3542 Locale locale("en"); 3543 UErrorCode status = U_ZERO_ERROR; 3544 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3545 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3546 UChar str[50]; 3547 static const char *strlist[] = 3548 { 3549 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3550 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3551 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3552 "\\u2027\\U000e0067\\u0a47\\u00b7", 3553 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3554 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3555 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3556 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3557 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3558 "\\u0027\\u11af\\U000e0057\\u0602", 3559 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 3560 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3561 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3562 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3563 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3564 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3565 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3566 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3567 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3568 "\\u58f4\\U000e0049\\u20e7\\u2027", 3569 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3570 "\\ua183\\u102d\\u0bec\\u003a", 3571 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3572 "\\u003a\\u0e57\\u0fad\\u002e", 3573 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3574 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3575 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3576 "\\u003a\\u0664\\u00b7\\u1fba", 3577 "\\u003b\\u0027\\u00b7\\u47a3", 3578 }; 3579 int loop; 3580 if (U_FAILURE(status)) { 3581 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3582 return; 3583 } 3584 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3585 // printf("looping %d\n", loop); 3586 u_unescape(strlist[loop], str, 20); 3587 UnicodeString ustr(str); 3588 int forward[50]; 3589 int count = 0; 3590 3591 bi->setText(ustr); 3592 int prev = 0; 3593 int i; 3594 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3595 forward[count ++] = i; 3596 if (i > prev) { 3597 int j; 3598 for (j = prev + 1; j < i; j ++) { 3599 if (bi->isBoundary(j)) { 3600 printStringBreaks(ustr, forward, count); 3601 errln("happy boundary test failed: expected %d not a boundary", 3602 j); 3603 return; 3604 } 3605 } 3606 } 3607 if (!bi->isBoundary(i)) { 3608 printStringBreaks(ustr, forward, count); 3609 errln("happy boundary test failed: expected %d a boundary", 3610 i); 3611 return; 3612 } 3613 prev = i; 3614 } 3615 } 3616 delete 
bi; 3617 } 3618 3619 void RBBITest::TestLineBreaks(void) 3620 { 3621 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3622 Locale locale("en"); 3623 UErrorCode status = U_ZERO_ERROR; 3624 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3625 const int32_t STRSIZE = 50; 3626 UChar str[STRSIZE]; 3627 static const char *strlist[] = 3628 { 3629 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3630 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3631 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3632 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3633 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3634 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3635 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3636 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3637 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3638 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3639 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3640 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3641 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3642 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3643 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3644 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3645 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3646 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3647 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3648 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3649 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3650 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3651 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3652 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3653 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3654 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3655 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3656 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3657 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3658 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3659 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3660 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3661 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3662 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3663 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3664 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3665 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3666 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3667 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3668 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3669 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3670 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3671 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3672 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3673 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3674 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3675 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3676 }; 3677 int loop; 3678 TEST_ASSERT_SUCCESS(status); 3679 if (U_FAILURE(status)) { 3680 return; 3681 } 3682 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3683 // printf("looping %d\n", loop); 3684 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3685 if (t >= STRSIZE) { 3686 TEST_ASSERT(FALSE); 3687 continue; 3688 } 3689 3690 3691 UnicodeString ustr(str); 3692 RBBILineMonkey monkey; 3693 if (U_FAILURE(monkey.deferredStatus)) { 3694 continue; 3695 } 3696 3697 const int EXPECTEDSIZE = 50; 3698 int expected[EXPECTEDSIZE]; 3699 int expectedcount = 0; 3700 3701 monkey.setText(ustr); 3702 int i; 3703 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3704 if (expectedcount >= EXPECTEDSIZE) { 3705 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3706 return; 3707 } 3708 expected[expectedcount ++] = i; 3709 } 3710 3711 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3712 } 3713 delete bi; 3714 #endif 3715 } 3716 3717 void RBBITest::TestSentBreaks(void) 3718 { 3719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3720 Locale locale("en"); 3721 UErrorCode status = U_ZERO_ERROR; 3722 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3723 UChar str[200]; 3724 static const char *strlist[] = 3725 { 3726 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3727 "This\n", 3728 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3729 "\"Sentence ending with a quote.\" Bye.", 3730 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3731 "Hi! 
This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3732 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3733 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3734 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3735 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3736 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3737 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3738 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3739 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3740 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3741 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3742 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3743 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3744 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3745 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3746 }; 3747 int loop; 3748 if (U_FAILURE(status)) { 3749 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3750 return; 3751 } 3752 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3753 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3754 UnicodeString ustr(str); 3755 3756 RBBISentMonkey monkey; 3757 if (U_FAILURE(monkey.deferredStatus)) { 3758 continue; 3759 } 3760 3761 const int EXPECTEDSIZE = 50; 3762 int expected[EXPECTEDSIZE]; 3763 int expectedcount = 0; 3764 3765 monkey.setText(ustr); 3766 int i; 3767 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3768 if (expectedcount >= EXPECTEDSIZE) { 3769 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3770 return; 3771 
} 3772 expected[expectedcount ++] = i; 3773 } 3774 3775 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3776 } 3777 delete bi; 3778 #endif 3779 } 3780 3781 void RBBITest::TestMonkey(char *params) { 3782 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3783 3784 UErrorCode status = U_ZERO_ERROR; 3785 int32_t loopCount = 500; 3786 int32_t seed = 1; 3787 UnicodeString breakType = "all"; 3788 Locale locale("en"); 3789 UBool useUText = FALSE; 3790 3791 if (quick == FALSE) { 3792 loopCount = 10000; 3793 } 3794 3795 if (params) { 3796 UnicodeString p(params); 3797 loopCount = getIntParam("loop", p, loopCount); 3798 seed = getIntParam("seed", p, seed); 3799 3800 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 3801 if (m.find()) { 3802 breakType = m.group(1, status); 3803 m.reset(); 3804 p = m.replaceFirst("", status); 3805 } 3806 3807 RegexMatcher u(" *utext", p, 0, status); 3808 if (u.find()) { 3809 useUText = TRUE; 3810 u.reset(); 3811 p = u.replaceFirst("", status); 3812 } 3813 3814 3815 // m.reset(p); 3816 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 3817 // Each option is stripped out of the option string as it is processed. 3818 // All options have been checked. The option string should have been completely emptied.. 
3819 char buf[100]; 3820 p.extract(buf, sizeof(buf), NULL, status); 3821 buf[sizeof(buf)-1] = 0; 3822 errln("Unrecognized or extra parameter: %s\n", buf); 3823 return; 3824 } 3825 3826 } 3827 3828 if (breakType == "char" || breakType == "all") { 3829 RBBICharMonkey m; 3830 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3831 if (U_SUCCESS(status)) { 3832 RunMonkey(bi, m, "char", seed, loopCount, useUText); 3833 if (breakType == "all" && useUText==FALSE) { 3834 // Also run a quick test with UText when "all" is specified 3835 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 3836 } 3837 } 3838 else { 3839 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 3840 } 3841 delete bi; 3842 } 3843 3844 if (breakType == "word" || breakType == "all") { 3845 logln("Word Break Monkey Test"); 3846 RBBIWordMonkey m; 3847 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3848 if (U_SUCCESS(status)) { 3849 RunMonkey(bi, m, "word", seed, loopCount, useUText); 3850 } 3851 else { 3852 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 3853 } 3854 delete bi; 3855 } 3856 3857 if (breakType == "line" || breakType == "all") { 3858 logln("Line Break Monkey Test"); 3859 RBBILineMonkey m; 3860 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3861 if (loopCount >= 10) { 3862 loopCount = loopCount / 5; // Line break runs slower than the others. 
3863 } 3864 if (U_SUCCESS(status)) { 3865 RunMonkey(bi, m, "line", seed, loopCount, useUText); 3866 } 3867 else { 3868 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3869 } 3870 delete bi; 3871 } 3872 3873 if (breakType == "sent" || breakType == "all" ) { 3874 logln("Sentence Break Monkey Test"); 3875 RBBISentMonkey m; 3876 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3877 if (loopCount >= 10) { 3878 loopCount = loopCount / 10; // Sentence runs slower than the other break types 3879 } 3880 if (U_SUCCESS(status)) { 3881 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 3882 } 3883 else { 3884 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3885 } 3886 delete bi; 3887 } 3888 3889 #endif 3890 } 3891 3892 // 3893 // Run a RBBI monkey test. Common routine, for all break iterator types. 3894 // Parameters: 3895 // bi - the break iterator to use 3896 // mk - MonkeyKind, abstraction for obtaining expected results 3897 // name - Name of test (char, word, etc.) 
for use in error messages 3898 // seed - Seed for starting random number generator (parameter from user) 3899 // numIterations 3900 // 3901 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 3902 int32_t numIterations, UBool useUText) { 3903 3904 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3905 3906 const int32_t TESTSTRINGLEN = 500; 3907 UnicodeString testText; 3908 int32_t numCharClasses; 3909 UVector *chClasses; 3910 int expected[TESTSTRINGLEN*2 + 1]; 3911 int expectedCount = 0; 3912 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 3913 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 3914 char reverseBreaks[TESTSTRINGLEN*2+1]; 3915 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 3916 char followingBreaks[TESTSTRINGLEN*2+1]; 3917 char precedingBreaks[TESTSTRINGLEN*2+1]; 3918 int i; 3919 int loopCount = 0; 3920 3921 m_seed = seed; 3922 3923 numCharClasses = mk.charClasses()->size(); 3924 chClasses = mk.charClasses(); 3925 3926 // Check for errors that occured during the construction of the MonkeyKind object. 3927 // Can't report them where they occured because errln() is a method coming from intlTest, 3928 // and is not visible outside of RBBITest :-( 3929 if (U_FAILURE(mk.deferredStatus)) { 3930 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 3931 return; 3932 } 3933 3934 // Verify that the character classes all have at least one member. 3935 for (i=0; i<numCharClasses; i++) { 3936 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 3937 if (s == NULL || s->size() == 0) { 3938 errln("Character Class #%d is null or of zero size.", i); 3939 return; 3940 } 3941 } 3942 3943 while (loopCount < numIterations || numIterations == -1) { 3944 if (numIterations == -1 && loopCount % 10 == 0) { 3945 // If test is running in an infinite loop, display a periodic tic so 3946 // we can tell that it is making progress. 
3947 fprintf(stderr, "."); 3948 } 3949 // Save current random number seed, so that we can recreate the random numbers 3950 // for this loop iteration in event of an error. 3951 seed = m_seed; 3952 3953 // Populate a test string with data. 3954 testText.truncate(0); 3955 for (i=0; i<TESTSTRINGLEN; i++) { 3956 int32_t aClassNum = m_rand() % numCharClasses; 3957 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 3958 int32_t charIdx = m_rand() % classSet->size(); 3959 UChar32 c = classSet->charAt(charIdx); 3960 if (c < 0) { // TODO: deal with sets containing strings. 3961 errln("c < 0"); 3962 break; 3963 } 3964 testText.append(c); 3965 } 3966 3967 // Calculate the expected results for this test string. 3968 mk.setText(testText); 3969 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 3970 expectedBreaks[0] = 1; 3971 int32_t breakPos = 0; 3972 expectedCount = 0; 3973 for (;;) { 3974 breakPos = mk.next(breakPos); 3975 if (breakPos == -1) { 3976 break; 3977 } 3978 if (breakPos > testText.length()) { 3979 errln("breakPos > testText.length()"); 3980 } 3981 expectedBreaks[breakPos] = 1; 3982 U_ASSERT(expectedCount<testText.length()); 3983 expected[expectedCount ++] = breakPos; 3984 } 3985 3986 // Find the break positions using forward iteration 3987 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 3988 if (useUText) { 3989 UErrorCode status = U_ZERO_ERROR; 3990 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 3991 // testUText = utext_openUnicodeString(testUText, &testText, &status); 3992 bi->setText(testUText, status); 3993 TEST_ASSERT_SUCCESS(status); 3994 utext_close(testUText); // The break iterator does a shallow clone of the UText 3995 // This UText can be closed immediately, so long as the 3996 // testText string continues to exist. 
3997 } else { 3998 bi->setText(testText); 3999 } 4000 4001 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4002 if (i < 0 || i > testText.length()) { 4003 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4004 break; 4005 } 4006 forwardBreaks[i] = 1; 4007 } 4008 4009 // Find the break positions using reverse iteration 4010 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4011 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4012 if (i < 0 || i > testText.length()) { 4013 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4014 break; 4015 } 4016 reverseBreaks[i] = 1; 4017 } 4018 4019 // Find the break positions using isBoundary() tests. 4020 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4021 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4022 for (i=0; i<=testText.length(); i++) { 4023 isBoundaryBreaks[i] = bi->isBoundary(i); 4024 } 4025 4026 4027 // Find the break positions using the following() function. 4028 // printf("."); 4029 memset(followingBreaks, 0, sizeof(followingBreaks)); 4030 int32_t lastBreakPos = 0; 4031 followingBreaks[0] = 1; 4032 for (i=0; i<testText.length(); i++) { 4033 breakPos = bi->following(i); 4034 if (breakPos <= i || 4035 breakPos < lastBreakPos || 4036 breakPos > testText.length() || 4037 (breakPos > lastBreakPos && lastBreakPos > i)) { 4038 errln("%s break monkey test: " 4039 "Out of range value returned by BreakIterator::following().\n" 4040 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4041 name, seed, i, breakPos, lastBreakPos); 4042 break; 4043 } 4044 followingBreaks[breakPos] = 1; 4045 lastBreakPos = breakPos; 4046 } 4047 4048 // Find the break positions using the preceding() function. 
4049 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4050 lastBreakPos = testText.length(); 4051 precedingBreaks[testText.length()] = 1; 4052 for (i=testText.length(); i>0; i--) { 4053 breakPos = bi->preceding(i); 4054 if (breakPos >= i || 4055 breakPos > lastBreakPos || 4056 (breakPos < 0 && testText.getChar32Start(i)>0) || 4057 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4058 errln("%s break monkey test: " 4059 "Out of range value returned by BreakIterator::preceding().\n" 4060 "index=%d; prev returned %d; lastBreak=%d" , 4061 name, i, breakPos, lastBreakPos); 4062 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4063 precedingBreaks[i] = 2; // Forces an error. 4064 } 4065 } else { 4066 if (breakPos >= 0) { 4067 precedingBreaks[breakPos] = 1; 4068 } 4069 lastBreakPos = breakPos; 4070 } 4071 } 4072 4073 // Compare the expected and actual results. 4074 for (i=0; i<=testText.length(); i++) { 4075 const char *errorType = NULL; 4076 if (forwardBreaks[i] != expectedBreaks[i]) { 4077 errorType = "next()"; 4078 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4079 errorType = "previous()"; 4080 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4081 errorType = "isBoundary()"; 4082 } else if (followingBreaks[i] != expectedBreaks[i]) { 4083 errorType = "following()"; 4084 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4085 errorType = "preceding()"; 4086 } 4087 4088 4089 if (errorType != NULL) { 4090 // Format a range of the test text that includes the failure as 4091 // a data item that can be included in the rbbi test data file. 4092 4093 // Start of the range is the last point where expected and actual results 4094 // both agreed that there was a break position. 
4095 int startContext = i; 4096 int32_t count = 0; 4097 for (;;) { 4098 if (startContext==0) { break; } 4099 startContext --; 4100 if (expectedBreaks[startContext] != 0) { 4101 if (count == 2) break; 4102 count ++; 4103 } 4104 } 4105 4106 // End of range is two expected breaks past the start position. 4107 int endContext = i + 1; 4108 int ci; 4109 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4110 for (;;) { 4111 if (endContext >= testText.length()) {break;} 4112 if (expectedBreaks[endContext-1] != 0) { 4113 if (count == 0) break; 4114 count --; 4115 } 4116 endContext ++; 4117 } 4118 } 4119 4120 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4121 UnicodeString errorText = "<data>"; 4122 /***if (strcmp(errorType, "next()") == 0) { 4123 startContext = 0; 4124 endContext = testText.length(); 4125 4126 printStringBreaks(testText, expected, expectedCount); 4127 }***/ 4128 4129 for (ci=startContext; ci<endContext;) { 4130 UnicodeString hexChars("0123456789abcdef"); 4131 UChar32 c; 4132 int bn; 4133 c = testText.char32At(ci); 4134 if (ci == i) { 4135 // This is the location of the error. 4136 errorText.append("<?>"); 4137 } else if (expectedBreaks[ci] != 0) { 4138 // This a non-error expected break position. 
4139 errorText.append("\\"); 4140 } 4141 if (c < 0x10000) { 4142 errorText.append("\\u"); 4143 for (bn=12; bn>=0; bn-=4) { 4144 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4145 } 4146 } else { 4147 errorText.append("\\U"); 4148 for (bn=28; bn>=0; bn-=4) { 4149 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4150 } 4151 } 4152 ci = testText.moveIndex32(ci, 1); 4153 } 4154 errorText.append("\\"); 4155 errorText.append("</data>\n"); 4156 4157 // Output the error 4158 char charErrorTxt[500]; 4159 UErrorCode status = U_ZERO_ERROR; 4160 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4161 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4162 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4163 4164 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4165 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4166 errorType, seed, i, charErrorTxt); 4167 break; 4168 } 4169 } 4170 4171 loopCount++; 4172 } 4173 #endif 4174 } 4175 4176 4177 // Bug 5532. UTF-8 based UText fails in dictionary code. 4178 // This test checks the initial patch, 4179 // which is to just keep it from crashing. Correct word boundaries 4180 // await a proper fix to the dictionary code. 4181 // 4182 void RBBITest::TestBug5532(void) { 4183 // Text includes a mixture of Thai and Latin. 
4184 const unsigned char utf8Data[] = { 4185 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4186 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4187 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4188 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4189 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4190 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4191 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4192 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4193 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4194 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4195 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4196 4197 UErrorCode status = U_ZERO_ERROR; 4198 UText utext=UTEXT_INITIALIZER; 4199 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4200 TEST_ASSERT_SUCCESS(status); 4201 4202 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4203 TEST_ASSERT_SUCCESS(status); 4204 if (U_SUCCESS(status)) { 4205 bi->setText(&utext, status); 4206 TEST_ASSERT_SUCCESS(status); 4207 4208 int32_t breakCount = 0; 4209 int32_t previousBreak = -1; 4210 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4211 // For now, just make sure that the break iterator doesn't hang. 
4212 TEST_ASSERT(previousBreak < bi->current()); 4213 previousBreak = bi->current(); 4214 } 4215 TEST_ASSERT(breakCount > 0); 4216 } 4217 delete bi; 4218 utext_close(&utext); 4219 } 4220 4221 4222 void RBBITest::TestBug9983(void) { 4223 UnicodeString text = UnicodeString("\\u002A" // * Other 4224 "\\uFF65" // Other 4225 "\\u309C" // Katakana 4226 "\\uFF9F" // Extend 4227 "\\uFF65" // Other 4228 "\\u0020" // Other 4229 "\\u0000").unescape(); 4230 4231 UErrorCode status = U_ZERO_ERROR; 4232 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4233 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4234 TEST_ASSERT_SUCCESS(status); 4235 if (U_FAILURE(status)) { 4236 return; 4237 } 4238 brkiter->setText(text); 4239 int32_t offset, rstatus; 4240 brkiter->last(); 4241 int32_t iterationCount = 0; 4242 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4243 iterationCount++; 4244 rstatus = brkiter->getRuleStatus(); 4245 // printf(" %d(%d)", offset, rstatus); 4246 if (iterationCount >= 10) { 4247 break; 4248 } 4249 } 4250 TEST_ASSERT(iterationCount == 6); 4251 } 4252 4253 4254 // 4255 // TestDebug - A place-holder test for debugging purposes. 4256 // For putting in fragments of other tests that can be invoked 4257 // for tracing without a lot of unwanted extra stuff happening. 4258 // 4259 void RBBITest::TestDebug(void) { 4260 #if 0 4261 UErrorCode status = U_ZERO_ERROR; 4262 int pos = 0; 4263 int ruleStatus = 0; 4264 4265 RuleBasedBreakIterator* bi = 4266 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4267 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4268 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4269 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4270 // UnicodeString s("Aaa. 
Bcd"); 4271 s = s.unescape(); 4272 bi->setText(s); 4273 UBool r = bi->isBoundary(8); 4274 printf("%s", r?"true":"false"); 4275 return; 4276 pos = bi->last(); 4277 do { 4278 // ruleStatus = bi->getRuleStatus(); 4279 printf("%d\t%d\n", pos, ruleStatus); 4280 pos = bi->previous(); 4281 } while (pos != BreakIterator::DONE); 4282 #endif 4283 } 4284 4285 void RBBITest::TestProperties() { 4286 UErrorCode errorCode = U_ZERO_ERROR; 4287 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4288 if (!prependSet.isEmpty()) { 4289 errln( 4290 "[:GCB=Prepend:] is not empty any more. " 4291 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4292 "change this test to the opposite condition."); 4293 } 4294 } 4295 4296 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4297