1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /******************************************************************** 4 * COPYRIGHT: 5 * Copyright (c) 1999-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ********************************************************************/ 8 /************************************************************************ 9 * Date Name Description 10 * 12/15/99 Madhu Creation. 11 * 01/12/2000 Madhu Updated for changed API and added new tests 12 ************************************************************************/ 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_BREAK_ITERATION 16 17 #include <stdio.h> 18 #include <stdlib.h> 19 #include <string.h> 20 21 #include "unicode/brkiter.h" 22 #include "unicode/localpointer.h" 23 #include "unicode/numfmt.h" 24 #include "unicode/rbbi.h" 25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 26 #include "unicode/regex.h" 27 #endif 28 #include "unicode/schriter.h" 29 #include "unicode/uchar.h" 30 #include "unicode/utf16.h" 31 #include "unicode/ucnv.h" 32 #include "unicode/uniset.h" 33 #include "unicode/uscript.h" 34 #include "unicode/ustring.h" 35 #include "unicode/utext.h" 36 37 #include "charstr.h" 38 #include "cmemory.h" 39 #include "cstr.h" 40 #include "intltest.h" 41 #include "rbbitst.h" 42 #include "utypeinfo.h" // for 'typeid' to work 43 #include "uvector.h" 44 #include "uvectr32.h" 45 46 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 47 #include "unicode/filteredbrk.h" 48 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION 49 50 #define TEST_ASSERT(x) {if (!(x)) { \ 51 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 52 53 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 54 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 55 56 //--------------------------------------------- 57 // runIndexedTest 58 
//--------------------------------------------- 59 60 61 // Note: Before adding new tests to this file, check whether the desired test data can 62 // simply be added to the file testdata/rbbitest.txt. In most cases it can, 63 // it's much less work than writing a new test, diagnostic output in the event of failures 64 // is good, and the test data file will is shared with ICU4J, so eventually the test 65 // will run there as well, without additional effort. 66 67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 68 { 69 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 70 fTestParams = params; 71 72 TESTCASE_AUTO_BEGIN; 73 #if !UCONFIG_NO_FILE_IO 74 TESTCASE_AUTO(TestBug4153072); 75 #endif 76 #if !UCONFIG_NO_FILE_IO 77 TESTCASE_AUTO(TestUnicodeFiles); 78 TESTCASE_AUTO(TestEmptyString); 79 #endif 80 TESTCASE_AUTO(TestGetAvailableLocales); 81 TESTCASE_AUTO(TestGetDisplayName); 82 #if !UCONFIG_NO_FILE_IO 83 TESTCASE_AUTO(TestEndBehaviour); 84 TESTCASE_AUTO(TestWordBreaks); 85 TESTCASE_AUTO(TestWordBoundary); 86 TESTCASE_AUTO(TestLineBreaks); 87 TESTCASE_AUTO(TestSentBreaks); 88 TESTCASE_AUTO(TestExtended); 89 #endif 90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 91 TESTCASE_AUTO(TestMonkey); 92 #endif 93 #if !UCONFIG_NO_FILE_IO 94 TESTCASE_AUTO(TestBug3818); 95 #endif 96 TESTCASE_AUTO(TestDebug); 97 #if !UCONFIG_NO_FILE_IO 98 TESTCASE_AUTO(TestBug5775); 99 #endif 100 TESTCASE_AUTO(TestBug9983); 101 TESTCASE_AUTO(TestDictRules); 102 TESTCASE_AUTO(TestBug5532); 103 TESTCASE_AUTO(TestBug7547); 104 TESTCASE_AUTO(TestBug12797); 105 TESTCASE_AUTO(TestBug12918); 106 TESTCASE_AUTO(TestBug12932); 107 TESTCASE_AUTO(TestEmoji); 108 TESTCASE_AUTO(TestBug12519); 109 TESTCASE_AUTO_END; 110 } 111 112 113 //--------------------------------------------------------------------------- 114 // 115 // class BITestData Holds a set of Break iterator test data and results 116 // Includes 117 // - the string data to be broken 118 // 
- a vector of the expected break positions. 119 // - a vector of source line numbers for the data, 120 // (to help see where errors occured.) 121 // - The expected break tag values. 122 // - Vectors of actual break positions and tag values. 123 // - Functions for comparing actual with expected and 124 // reporting errors. 125 // 126 //---------------------------------------------------------------------------- 127 class BITestData { 128 public: 129 UnicodeString fDataToBreak; 130 UVector fExpectedBreakPositions; 131 UVector fExpectedTags; 132 UVector fLineNum; 133 UVector fActualBreakPositions; // Test Results. 134 UVector fActualTags; 135 136 BITestData(UErrorCode &status); 137 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 138 void checkResults(const char *heading, RBBITest *test); 139 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 140 void clearResults(); 141 }; 142 143 // 144 // Constructor. 145 // 146 BITestData::BITestData(UErrorCode &status) 147 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 148 fActualTags(status) 149 { 150 } 151 152 // 153 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 154 // The macro form collects the line number, which is helpful 155 // when tracking down failures. 156 // 157 // A null data item is inserted at the start of each test's data 158 // to put the starting zero into the data list. The position saved for 159 // each non-null item is its ending position. 
160 // 161 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 162 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 163 if (U_FAILURE(status)) {return;} 164 if (data != NULL) { 165 fDataToBreak.append(CharsToUnicodeString(data)); 166 } 167 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 168 fExpectedTags.addElement(tag, status); 169 fLineNum.addElement(lineNum, status); 170 } 171 172 173 // 174 // checkResults. Compare the actual and expected break positions, report any differences. 175 // 176 void BITestData::checkResults(const char *heading, RBBITest *test) { 177 int32_t expectedIndex = 0; 178 int32_t actualIndex = 0; 179 180 for (;;) { 181 // If we've run through both the expected and actual results vectors, we're done. 182 // break out of the loop. 183 if (expectedIndex >= fExpectedBreakPositions.size() && 184 actualIndex >= fActualBreakPositions.size()) { 185 break; 186 } 187 188 189 if (expectedIndex >= fExpectedBreakPositions.size()) { 190 err(heading, test, expectedIndex-1, actualIndex); 191 actualIndex++; 192 continue; 193 } 194 195 if (actualIndex >= fActualBreakPositions.size()) { 196 err(heading, test, expectedIndex, actualIndex-1); 197 expectedIndex++; 198 continue; 199 } 200 201 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 202 err(heading, test, expectedIndex, actualIndex); 203 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 204 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 205 actualIndex++; 206 } else { 207 expectedIndex++; 208 } 209 continue; 210 } 211 212 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 213 test->errln("%s, tag mismatch. 
Test Line = %d, expected tag=%d, got %d", 214 heading, fLineNum.elementAt(expectedIndex), 215 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 216 } 217 218 actualIndex++; 219 expectedIndex++; 220 } 221 } 222 223 // 224 // err - An error was found. Report it, along with information about where the 225 // incorrectly broken test data appeared in the source file. 226 // 227 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 228 { 229 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 230 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 231 int32_t o = 0; 232 int32_t line = fLineNum.elementAti(expectedIdx); 233 if (expectedIdx > 0) { 234 // The line numbers are off by one because a premature break occurs somewhere 235 // within the previous item, rather than at the start of the current (expected) item. 236 // We want to report the offset of the unexpected break from the start of 237 // this previous item. 238 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 239 } 240 if (actual < expected) { 241 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 242 } else { 243 test->errln("%s Failed to find break at end of item from line %d. 
actual break: %d expected break: %d", heading, line, actual, expected); 244 } 245 } 246 247 248 void BITestData::clearResults() { 249 fActualBreakPositions.removeAllElements(); 250 fActualTags.removeAllElements(); 251 } 252 253 254 //-------------------------------------------------------------------------------------- 255 // 256 // RBBITest constructor and destructor 257 // 258 //-------------------------------------------------------------------------------------- 259 260 RBBITest::RBBITest() { 261 fTestParams = NULL; 262 } 263 264 265 RBBITest::~RBBITest() { 266 } 267 268 269 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 270 UErrorCode status = U_ZERO_ERROR; 271 char name[100]; 272 printf("code alpha extend alphanum type word sent line name\n"); 273 int nextExpectedIndex = 0; 274 utext_setNativeIndex(tstr, 0); 275 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 276 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 277 printf("------------------------------------------------ %d\n", j); 278 ++nextExpectedIndex; 279 } 280 281 UChar32 c = utext_next32(tstr); 282 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 283 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, 284 u_isUAlphabetic(c), 285 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 286 u_isalnum(c), 287 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 288 u_charType(c), 289 U_SHORT_PROPERTY_NAME), 290 u_getPropertyValueName(UCHAR_WORD_BREAK, 291 u_getIntPropertyValue(c, 292 UCHAR_WORD_BREAK), 293 U_SHORT_PROPERTY_NAME), 294 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 295 u_getIntPropertyValue(c, 296 UCHAR_SENTENCE_BREAK), 297 U_SHORT_PROPERTY_NAME), 298 u_getPropertyValueName(UCHAR_LINE_BREAK, 299 u_getIntPropertyValue(c, 300 UCHAR_LINE_BREAK), 301 U_SHORT_PROPERTY_NAME), 302 name); 303 } 304 } 305 306 307 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 308 
UErrorCode status = U_ZERO_ERROR; 309 UText *tstr = NULL; 310 tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 311 if (U_FAILURE(status)) { 312 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status)); 313 return; 314 } 315 printStringBreaks(tstr, expected, expectedCount); 316 utext_close(tstr); 317 } 318 319 320 void RBBITest::TestBug3818() { 321 UErrorCode status = U_ZERO_ERROR; 322 323 // Four Thai words... 324 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 325 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 326 UnicodeString thaiStr(thaiWordData); 327 328 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 329 if (U_FAILURE(status) || bi == NULL) { 330 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 331 return; 332 } 333 bi->setText(thaiStr); 334 335 int32_t startOfSecondWord = bi->following(1); 336 if (startOfSecondWord != 4) { 337 errln("Fail at file %s, line %d expected start of word at 4, got %d", 338 __FILE__, __LINE__, startOfSecondWord); 339 } 340 startOfSecondWord = bi->following(0); 341 if (startOfSecondWord != 4) { 342 errln("Fail at file %s, line %d expected start of word at 4, got %d", 343 __FILE__, __LINE__, startOfSecondWord); 344 } 345 delete bi; 346 } 347 348 //---------------------------------------------------------------------------- 349 // 350 // generalIteratorTest Given a break iterator and a set of test data, 351 // Run the tests and report the results. 
352 // 353 //---------------------------------------------------------------------------- 354 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 355 { 356 357 bi.setText(td.fDataToBreak); 358 359 testFirstAndNext(bi, td); 360 361 testLastAndPrevious(bi, td); 362 363 testFollowing(bi, td); 364 testPreceding(bi, td); 365 testIsBoundary(bi, td); 366 doMultipleSelectionTest(bi, td); 367 } 368 369 370 // 371 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 372 // kind of loop. 373 // 374 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 375 { 376 UErrorCode status = U_ZERO_ERROR; 377 int32_t p; 378 int32_t lastP = -1; 379 int32_t tag; 380 381 logln("Test first and next"); 382 bi.setText(td.fDataToBreak); 383 td.clearResults(); 384 385 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 386 td.fActualBreakPositions.addElement(p, status); // Save result. 387 tag = bi.getRuleStatus(); 388 td.fActualTags.addElement(tag, status); 389 if (p <= lastP) { 390 // If the iterator is not making forward progress, stop. 391 // No need to raise an error here, it'll be detected in the normal check of results. 392 break; 393 } 394 lastP = p; 395 } 396 td.checkResults("testFirstAndNext", this); 397 } 398 399 400 // 401 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 402 // 403 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 404 { 405 UErrorCode status = U_ZERO_ERROR; 406 int32_t p; 407 int32_t lastP = 0x7ffffffe; 408 int32_t tag; 409 410 logln("Test last and previous"); 411 bi.setText(td.fDataToBreak); 412 td.clearResults(); 413 414 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 415 // Save break position. Insert it at start of vector of results, shoving 416 // already-saved results further towards the end. 
417 td.fActualBreakPositions.insertElementAt(p, 0, status); 418 // bi.previous(); // TODO: Why does this fix things up???? 419 // bi.next(); 420 tag = bi.getRuleStatus(); 421 td.fActualTags.insertElementAt(tag, 0, status); 422 if (p >= lastP) { 423 // If the iterator is not making progress, stop. 424 // No need to raise an error here, it'll be detected in the normal check of results. 425 break; 426 } 427 lastP = p; 428 } 429 td.checkResults("testLastAndPrevious", this); 430 } 431 432 433 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 434 { 435 UErrorCode status = U_ZERO_ERROR; 436 int32_t p; 437 int32_t tag; 438 int32_t lastP = -2; // A value that will never be returned as a break position. 439 // cannot be -1; that is returned for DONE. 440 int i; 441 442 logln("testFollowing():"); 443 bi.setText(td.fDataToBreak); 444 td.clearResults(); 445 446 // Save the starting point, since we won't get that out of following. 447 p = bi.first(); 448 td.fActualBreakPositions.addElement(p, status); // Save result. 449 tag = bi.getRuleStatus(); 450 td.fActualTags.addElement(tag, status); 451 452 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 453 p = bi.following(i); 454 if (p != lastP) { 455 if (p == RuleBasedBreakIterator::DONE) { 456 break; 457 } 458 // We've reached a new break position. Save it. 459 td.fActualBreakPositions.addElement(p, status); // Save result. 460 tag = bi.getRuleStatus(); 461 td.fActualTags.addElement(tag, status); 462 lastP = p; 463 } 464 } 465 // The loop normally exits by means of the break in the middle. 466 // Make sure that the index was at the correct position for the break iterator to have 467 // returned DONE. 468 if (i != td.fDataToBreak.length()) { 469 errln("testFollowing(): iterator returned DONE prematurely."); 470 } 471 472 // Full check of all results. 
473 td.checkResults("testFollowing", this); 474 } 475 476 477 478 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 479 UErrorCode status = U_ZERO_ERROR; 480 int32_t p; 481 int32_t tag; 482 int32_t lastP = 0x7ffffffe; 483 int i; 484 485 logln("testPreceding():"); 486 bi.setText(td.fDataToBreak); 487 td.clearResults(); 488 489 p = bi.last(); 490 td.fActualBreakPositions.addElement(p, status); 491 tag = bi.getRuleStatus(); 492 td.fActualTags.addElement(tag, status); 493 494 for (i = td.fDataToBreak.length(); i>=-1; i--) { 495 p = bi.preceding(i); 496 if (p != lastP) { 497 if (p == RuleBasedBreakIterator::DONE) { 498 break; 499 } 500 // We've reached a new break position. Save it. 501 td.fActualBreakPositions.insertElementAt(p, 0, status); 502 lastP = p; 503 tag = bi.getRuleStatus(); 504 td.fActualTags.insertElementAt(tag, 0, status); 505 } 506 } 507 // The loop normally exits by means of the break in the middle. 508 // Make sure that the index was at the correct position for the break iterator to have 509 // returned DONE. 510 if (i != 0) { 511 errln("testPreceding(): iterator returned DONE prematurely."); 512 } 513 514 // Full check of all results. 515 td.checkResults("testPreceding", this); 516 } 517 518 519 520 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 521 UErrorCode status = U_ZERO_ERROR; 522 int i; 523 int32_t tag; 524 525 logln("testIsBoundary():"); 526 bi.setText(td.fDataToBreak); 527 td.clearResults(); 528 529 for (i = 0; i <= td.fDataToBreak.length(); i++) { 530 if (bi.isBoundary(i)) { 531 td.fActualBreakPositions.addElement(i, status); // Save result. 
532 tag = bi.getRuleStatus(); 533 td.fActualTags.addElement(tag, status); 534 } 535 } 536 td.checkResults("testIsBoundary: ", this); 537 } 538 539 540 541 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 542 { 543 iterator.setText(td.fDataToBreak); 544 545 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 546 int32_t offset = iterator.first(); 547 int32_t testOffset; 548 int32_t count = 0; 549 550 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 551 552 if (*testIterator != iterator) 553 errln("clone() or operator!= failed: two clones compared unequal"); 554 555 do { 556 testOffset = testIterator->first(); 557 testOffset = testIterator->next(count); 558 if (offset != testOffset) 559 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 560 561 if (offset != RuleBasedBreakIterator::DONE) { 562 count++; 563 offset = iterator.next(); 564 565 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 566 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 567 if (count > 10000 || offset == -1) { 568 errln("operator== failed too many times. Stopping test."); 569 if (offset == -1) { 570 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 571 } 572 return; 573 } 574 } 575 } 576 } while (offset != RuleBasedBreakIterator::DONE); 577 578 // now do it backwards... 
579 offset = iterator.last(); 580 count = 0; 581 582 do { 583 testOffset = testIterator->last(); 584 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 585 if (offset != testOffset) 586 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 587 588 if (offset != RuleBasedBreakIterator::DONE) { 589 count--; 590 offset = iterator.previous(); 591 } 592 } while (offset != RuleBasedBreakIterator::DONE); 593 594 delete testIterator; 595 } 596 597 598 //--------------------------------------------- 599 // 600 // other tests 601 // 602 //--------------------------------------------- 603 void RBBITest::TestEmptyString() 604 { 605 UnicodeString text = ""; 606 UErrorCode status = U_ZERO_ERROR; 607 608 BITestData x(status); 609 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 610 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 611 if (U_FAILURE(status)) 612 { 613 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 614 return; 615 } 616 generalIteratorTest(*bi, x); 617 delete bi; 618 } 619 620 void RBBITest::TestGetAvailableLocales() 621 { 622 int32_t locCount = 0; 623 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 624 625 if (locCount == 0) 626 dataerrln("getAvailableLocales() returned an empty list!"); 627 // Just make sure that it's returning good memory. 
628 int32_t i; 629 for (i = 0; i < locCount; ++i) { 630 logln(locList[i].getName()); 631 } 632 } 633 634 //Testing the BreakIterator::getDisplayName() function 635 void RBBITest::TestGetDisplayName() 636 { 637 UnicodeString result; 638 639 BreakIterator::getDisplayName(Locale::getUS(), result); 640 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 641 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 642 + result); 643 644 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 645 if (result != "French (France)") 646 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 647 + result); 648 } 649 /** 650 * Test End Behaviour 651 * @bug 4068137 652 */ 653 void RBBITest::TestEndBehaviour() 654 { 655 UErrorCode status = U_ZERO_ERROR; 656 UnicodeString testString("boo."); 657 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 658 if (U_FAILURE(status)) 659 { 660 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 661 return; 662 } 663 wb->setText(testString); 664 665 if (wb->first() != 0) 666 errln("Didn't get break at beginning of string."); 667 if (wb->next() != 3) 668 errln("Didn't get break before period in \"boo.\""); 669 if (wb->current() != 4 && wb->next() != 4) 670 errln("Didn't get break at end of string."); 671 delete wb; 672 } 673 /* 674 * @bug 4153072 675 */ 676 void RBBITest::TestBug4153072() { 677 UErrorCode status = U_ZERO_ERROR; 678 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 679 if (U_FAILURE(status)) 680 { 681 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 682 return; 683 } 684 UnicodeString str("...Hello, World!..."); 685 int32_t begin = 3; 686 int32_t end = str.length() - 3; 687 UBool onBoundary; 688 689 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 690 iter->adoptText(textIterator); 691 int index; 692 // Note: with the switch to UText, there is no way to restrict the 693 // iteration range to begin at an index other than zero. 694 // String character iterators created with a non-zero bound are 695 // treated by RBBI as being empty. 696 for (index = -1; index < begin + 1; ++index) { 697 onBoundary = iter->isBoundary(index); 698 if (index == 0? !onBoundary : onBoundary) { 699 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 700 " and begin index = " + begin); 701 } 702 } 703 delete iter; 704 } 705 706 707 // 708 // Test for problem reported by Ashok Matoria on 9 July 2007 709 // One.<kSoftHyphen><kSpace>Two. 710 // 711 // Sentence break at start (0) and then on calling next() it breaks at 712 // 'T' of "Two". Now, at this point if I do next() and 713 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
714 // 715 void RBBITest::TestBug5775() { 716 UErrorCode status = U_ZERO_ERROR; 717 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 718 TEST_ASSERT_SUCCESS(status); 719 if (U_FAILURE(status)) { 720 return; 721 } 722 // Check for status first for better handling of no data errors. 723 TEST_ASSERT(bi != NULL); 724 if (bi == NULL) { 725 return; 726 } 727 728 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 729 // 01234 56789 730 s = s.unescape(); 731 bi->setText(s); 732 int pos = bi->next(); 733 TEST_ASSERT(pos == 6); 734 pos = bi->next(); 735 TEST_ASSERT(pos == 10); 736 pos = bi->previous(); 737 TEST_ASSERT(pos == 6); 738 delete bi; 739 } 740 741 742 743 //------------------------------------------------------------------------------ 744 // 745 // RBBITest::Extended Run RBBI Tests from an external test data file 746 // 747 //------------------------------------------------------------------------------ 748 749 struct TestParams { 750 BreakIterator *bi; // Break iterator is set while parsing test source. 751 // Changed out whenever test data changes break type. 752 753 UnicodeString dataToBreak; // Data that is built up while parsing the test. 754 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 755 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 756 UVector32 *srcCol; 757 758 UText *textToBreak; // UText, could be UTF8 or UTF16. 759 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 760 CharString utf8String; // UTF-8 form of text to break. 
761 762 TestParams(UErrorCode &status) : dataToBreak() { 763 bi = NULL; 764 expectedBreaks = new UVector32(status); 765 srcLine = new UVector32(status); 766 srcCol = new UVector32(status); 767 textToBreak = NULL; 768 textMap = new UVector32(status); 769 } 770 771 ~TestParams() { 772 delete bi; 773 delete expectedBreaks; 774 delete srcLine; 775 delete srcCol; 776 utext_close(textToBreak); 777 delete textMap; 778 } 779 780 int32_t getSrcLine(int32_t bp); 781 int32_t getExpectedBreak(int32_t bp); 782 int32_t getSrcCol(int32_t bp); 783 784 void setUTF16(UErrorCode &status); 785 void setUTF8(UErrorCode &status); 786 }; 787 788 // Append a UnicodeString to a CharString with UTF-8 encoding. 789 // Substitute any invalid chars. 790 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 791 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 792 if (U_FAILURE(status)) { 793 return; 794 } 795 int32_t utf8Length; 796 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 797 src.getBuffer(), src.length(), // UTF-16 data 798 0xfffd, NULL, // Substitution char, number of subs. 
799 &status); 800 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 801 return; 802 } 803 status = U_ZERO_ERROR; 804 int32_t capacity; 805 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 806 u_strToUTF8WithSub(buffer, utf8Length, NULL, 807 src.getBuffer(), src.length(), 808 0xfffd, NULL, &status); 809 dest.append(buffer, utf8Length, status); 810 } 811 812 813 void TestParams::setUTF16(UErrorCode &status) { 814 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 815 textMap->removeAllElements(); 816 for (int32_t i=0; i<dataToBreak.length(); i++) { 817 if (i == dataToBreak.getChar32Start(i)) { 818 textMap->addElement(i, status); 819 } else { 820 textMap->addElement(-1, status); 821 } 822 } 823 textMap->addElement(dataToBreak.length(), status); 824 U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 825 } 826 827 828 void TestParams::setUTF8(UErrorCode &status) { 829 if (U_FAILURE(status)) { 830 return; 831 } 832 utf8String.clear(); 833 CharStringAppend(utf8String, dataToBreak, status); 834 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 835 if (U_FAILURE(status)) { 836 return; 837 } 838 839 textMap->removeAllElements(); 840 int32_t utf16Index = 0; 841 for (;;) { 842 textMap->addElement(utf16Index, status); 843 UChar32 c32 = utext_current32(textToBreak); 844 if (c32 < 0) { 845 break; 846 } 847 utf16Index += U16_LENGTH(c32); 848 utext_next32(textToBreak); 849 while (textMap->size() < utext_getNativeIndex(textToBreak)) { 850 textMap->addElement(-1, status); 851 } 852 } 853 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 854 } 855 856 857 int32_t TestParams::getSrcLine(int32_t bp) { 858 if (bp >= textMap->size()) { 859 bp = textMap->size() - 1; 860 } 861 int32_t i = 0; 862 for(; bp >= 0 ; --bp) { 863 // Move to a character boundary if we are not on one already. 
864 i = textMap->elementAti(bp); 865 if (i >= 0) { 866 break; 867 } 868 } 869 return srcLine->elementAti(i); 870 } 871 872 873 int32_t TestParams::getExpectedBreak(int32_t bp) { 874 if (bp >= textMap->size()) { 875 return 0; 876 } 877 int32_t i = textMap->elementAti(bp); 878 int32_t retVal = 0; 879 if (i >= 0) { 880 retVal = expectedBreaks->elementAti(i); 881 } 882 return retVal; 883 } 884 885 886 int32_t TestParams::getSrcCol(int32_t bp) { 887 if (bp >= textMap->size()) { 888 bp = textMap->size() - 1; 889 } 890 int32_t i = 0; 891 for(; bp >= 0; --bp) { 892 // Move bp to a character boundary if we are not on one already. 893 i = textMap->elementAti(bp); 894 if (i >= 0) { 895 break; 896 } 897 } 898 return srcCol->elementAti(i); 899 } 900 901 902 void RBBITest::executeTest(TestParams *t, UErrorCode &status) { 903 int32_t bp; 904 int32_t prevBP; 905 int32_t i; 906 907 TEST_ASSERT_SUCCESS(status); 908 if (U_FAILURE(status)) { 909 return; 910 } 911 912 if (t->bi == NULL) { 913 return; 914 } 915 916 t->bi->setText(t->textToBreak, status); 917 // 918 // Run the iterator forward 919 // 920 prevBP = -1; 921 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 922 if (prevBP == bp) { 923 // Fail for lack of forward progress. 924 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 925 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 926 break; 927 } 928 929 // Check that there we didn't miss an expected break between the last one 930 // and this one. 931 for (i=prevBP+1; i<bp; i++) { 932 if (t->getExpectedBreak(i) != 0) { 933 int expected[] = {0, i}; 934 printStringBreaks(t->dataToBreak, expected, 2); 935 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 936 i, t->getSrcLine(i), t->getSrcCol(i)); 937 } 938 } 939 940 // Check that the break we did find was expected 941 if (t->getExpectedBreak(bp) == 0) { 942 int expected[] = {0, bp}; 943 printStringBreaks(t->textToBreak, expected, 2); 944 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 945 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 946 } else { 947 // The break was expected. 948 // Check that the {nnn} tag value is correct. 949 int32_t expectedTagVal = t->getExpectedBreak(bp); 950 if (expectedTagVal == -1) { 951 expectedTagVal = 0; 952 } 953 int32_t line = t->getSrcLine(bp); 954 int32_t rs = t->bi->getRuleStatus(); 955 if (rs != expectedTagVal) { 956 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 957 " Actual, Expected status = %4d, %4d", 958 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 959 } 960 } 961 962 prevBP = bp; 963 } 964 965 // Verify that there were no missed expected breaks after the last one found 966 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 967 if (t->getExpectedBreak(i) != 0) { 968 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 969 i, t->getSrcLine(i), t->getSrcCol(i)); 970 } 971 } 972 973 // 974 // Run the iterator backwards, verify that the same breaks are found. 975 // 976 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 977 bp = t->bi->last(); 978 while (bp != BreakIterator::DONE) { 979 if (prevBP == bp) { 980 // Fail for lack of progress. 981 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 982 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 983 break; 984 } 985 986 // Check that we didn't miss an expected break between the last one 987 // and this one. (UVector returns zeros for index out of bounds.) 
988 for (i=prevBP-1; i>bp; i--) { 989 if (t->getExpectedBreak(i) != 0) { 990 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 991 i, t->getSrcLine(i), t->getSrcCol(i)); 992 } 993 } 994 995 // Check that the break we did find was expected 996 if (t->getExpectedBreak(bp) == 0) { 997 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 998 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 999 } else { 1000 // The break was expected. 1001 // Check that the {nnn} tag value is correct. 1002 int32_t expectedTagVal = t->getExpectedBreak(bp); 1003 if (expectedTagVal == -1) { 1004 expectedTagVal = 0; 1005 } 1006 int line = t->getSrcLine(bp); 1007 int32_t rs = t->bi->getRuleStatus(); 1008 if (rs != expectedTagVal) { 1009 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1010 " Actual, Expected status = %4d, %4d", 1011 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1012 } 1013 } 1014 1015 prevBP = bp; 1016 bp = t->bi->previous(); 1017 } 1018 1019 // Verify that there were no missed breaks prior to the last one found 1020 for (i=prevBP-1; i>=0; i--) { 1021 if (t->getExpectedBreak(i) != 0) { 1022 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1023 i, t->getSrcLine(i), t->getSrcCol(i)); 1024 } 1025 } 1026 1027 // Check isBoundary() 1028 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1029 UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 1030 UBool boundaryFound = t->bi->isBoundary(i); 1031 if (boundaryExpected != boundaryFound) { 1032 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 1033 " Expected, Actual= %s, %s", 1034 i, t->getSrcLine(i), t->getSrcCol(i), 1035 boundaryExpected ? "true":"false", boundaryFound? 
"true" : "false"); 1036 } 1037 } 1038 1039 // Check following() 1040 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1041 int32_t actualBreak = t->bi->following(i); 1042 int32_t expectedBreak = BreakIterator::DONE; 1043 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 1044 if (t->getExpectedBreak(j) != 0) { 1045 expectedBreak = j; 1046 break; 1047 } 1048 } 1049 if (expectedBreak != actualBreak) { 1050 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 1051 " Expected, Actual= %d, %d", 1052 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1053 } 1054 } 1055 1056 // Check preceding() 1057 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 1058 int32_t actualBreak = t->bi->preceding(i); 1059 int32_t expectedBreak = BreakIterator::DONE; 1060 1061 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 1062 // preceding(trailing byte) will return the index of some preceding code point, 1063 // not the lead byte of the current code point, even though that has a smaller index. 1064 // Therefore, start looking at the expected break data not at i-1, but at 1065 // the start of code point index - 1. 1066 utext_setNativeIndex(t->textToBreak, i); 1067 int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 1068 for (; j >= 0; j--) { 1069 if (t->getExpectedBreak(j) != 0) { 1070 expectedBreak = j; 1071 break; 1072 } 1073 } 1074 if (expectedBreak != actualBreak) { 1075 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 1076 " Expected, Actual= %d, %d", 1077 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1078 } 1079 } 1080 } 1081 1082 1083 void RBBITest::TestExtended() { 1084 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This 1085 // data driven test closely entangles filtered and regular data. 
1086 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION 1087 UErrorCode status = U_ZERO_ERROR; 1088 Locale locale(""); 1089 1090 TestParams tp(status); 1091 1092 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status); 1093 if (U_FAILURE(status)) { 1094 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1095 } 1096 1097 // 1098 // Open and read the test data file. 1099 // 1100 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1101 CharString testFileName(testDataDirectory, -1, status); 1102 testFileName.append("rbbitst.txt", -1, status); 1103 1104 int len; 1105 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); 1106 if (U_FAILURE(status)) { 1107 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status)); 1108 return; 1109 } 1110 1111 bool skipTest = false; // Skip this test? 1112 1113 // 1114 // Put the test data into a UnicodeString 1115 // 1116 UnicodeString testString(FALSE, testFile, len); 1117 1118 enum EParseState{ 1119 PARSE_COMMENT, 1120 PARSE_TAG, 1121 PARSE_DATA, 1122 PARSE_NUM, 1123 PARSE_RULES 1124 } 1125 parseState = PARSE_TAG; 1126 1127 EParseState savedState = PARSE_TAG; 1128 1129 int32_t lineNum = 1; 1130 int32_t colStart = 0; 1131 int32_t column = 0; 1132 int32_t charIdx = 0; 1133 1134 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 1135 1136 UnicodeString rules; // Holds rules from a <rules> ... 
</rules> block 1137 int32_t rulesFirstLine; // Line number of the start of current <rules> block 1138 1139 for (charIdx = 0; charIdx < len; ) { 1140 status = U_ZERO_ERROR; 1141 UChar c = testString.charAt(charIdx); 1142 charIdx++; 1143 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') { 1144 // treat CRLF as a unit 1145 c = u'\n'; 1146 charIdx++; 1147 } 1148 if (c == u'\n' || c == u'\r') { 1149 lineNum++; 1150 colStart = charIdx; 1151 } 1152 column = charIdx - colStart + 1; 1153 1154 switch (parseState) { 1155 case PARSE_COMMENT: 1156 if (c == u'\n' || c == u'\r') { 1157 parseState = savedState; 1158 } 1159 break; 1160 1161 case PARSE_TAG: 1162 { 1163 if (c == u'#') { 1164 parseState = PARSE_COMMENT; 1165 savedState = PARSE_TAG; 1166 break; 1167 } 1168 if (u_isUWhiteSpace(c)) { 1169 break; 1170 } 1171 if (testString.compare(charIdx-1, 6, u"<word>") == 0) { 1172 delete tp.bi; 1173 tp.bi = BreakIterator::createWordInstance(locale, status); 1174 skipTest = false; 1175 charIdx += 5; 1176 break; 1177 } 1178 if (testString.compare(charIdx-1, 6, u"<char>") == 0) { 1179 delete tp.bi; 1180 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1181 skipTest = false; 1182 charIdx += 5; 1183 break; 1184 } 1185 if (testString.compare(charIdx-1, 6, u"<line>") == 0) { 1186 delete tp.bi; 1187 tp.bi = BreakIterator::createLineInstance(locale, status); 1188 skipTest = false; 1189 charIdx += 5; 1190 break; 1191 } 1192 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) { 1193 delete tp.bi; 1194 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1195 skipTest = false; 1196 charIdx += 5; 1197 break; 1198 } 1199 if (testString.compare(charIdx-1, 7, u"<title>") == 0) { 1200 delete tp.bi; 1201 tp.bi = BreakIterator::createTitleInstance(locale, status); 1202 charIdx += 6; 1203 break; 1204 } 1205 1206 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 || 1207 testString.compare(charIdx-1, 10, u"<badrules>") == 0) { 1208 charIdx = 
testString.indexOf(u'>', charIdx) + 1; 1209 parseState = PARSE_RULES; 1210 rules.remove(); 1211 rulesFirstLine = lineNum; 1212 break; 1213 } 1214 1215 // <locale loc_name> 1216 localeMatcher.reset(testString); 1217 if (localeMatcher.lookingAt(charIdx-1, status)) { 1218 UnicodeString localeName = localeMatcher.group(1, status); 1219 char localeName8[100]; 1220 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1221 locale = Locale::createFromName(localeName8); 1222 charIdx += localeMatcher.group(0, status).length() - 1; 1223 TEST_ASSERT_SUCCESS(status); 1224 break; 1225 } 1226 if (testString.compare(charIdx-1, 6, u"<data>") == 0) { 1227 parseState = PARSE_DATA; 1228 charIdx += 5; 1229 tp.dataToBreak = ""; 1230 tp.expectedBreaks->removeAllElements(); 1231 tp.srcCol ->removeAllElements(); 1232 tp.srcLine->removeAllElements(); 1233 break; 1234 } 1235 1236 errln("line %d: Tag expected in test file.", lineNum); 1237 parseState = PARSE_COMMENT; 1238 savedState = PARSE_DATA; 1239 goto end_test; // Stop the test. 
1240 } 1241 break; 1242 1243 case PARSE_RULES: 1244 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) { 1245 charIdx += 7; 1246 parseState = PARSE_TAG; 1247 delete tp.bi; 1248 UParseError pe; 1249 tp.bi = new RuleBasedBreakIterator(rules, pe, status); 1250 skipTest = U_FAILURE(status); 1251 if (U_FAILURE(status)) { 1252 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.", 1253 rulesFirstLine + pe.line - 1, u_errorName(status)); 1254 } 1255 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) { 1256 charIdx += 10; 1257 parseState = PARSE_TAG; 1258 UErrorCode ec = U_ZERO_ERROR; 1259 UParseError pe; 1260 RuleBasedBreakIterator bi(rules, pe, ec); 1261 if (U_SUCCESS(ec)) { 1262 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.", 1263 rulesFirstLine + pe.line - 1); 1264 } 1265 } else { 1266 rules.append(c); 1267 } 1268 break; 1269 1270 case PARSE_DATA: 1271 if (c == u'') { 1272 int32_t breakIdx = tp.dataToBreak.length(); 1273 tp.expectedBreaks->setSize(breakIdx+1); 1274 tp.expectedBreaks->setElementAt(-1, breakIdx); 1275 tp.srcLine->setSize(breakIdx+1); 1276 tp.srcLine->setElementAt(lineNum, breakIdx); 1277 tp.srcCol ->setSize(breakIdx+1); 1278 tp.srcCol ->setElementAt(column, breakIdx); 1279 break; 1280 } 1281 1282 if (testString.compare(charIdx-1, 7, u"</data>") == 0) { 1283 // Add final entry to mappings from break location to source file position. 1284 // Need one extra because last break position returned is after the 1285 // last char in the data, not at the last char. 1286 tp.srcLine->addElement(lineNum, status); 1287 tp.srcCol ->addElement(column, status); 1288 1289 parseState = PARSE_TAG; 1290 charIdx += 6; 1291 1292 if (!skipTest) { 1293 // RUN THE TEST! 1294 status = U_ZERO_ERROR; 1295 tp.setUTF16(status); 1296 executeTest(&tp, status); 1297 TEST_ASSERT_SUCCESS(status); 1298 1299 // Run again, this time with UTF-8 text wrapped in a UText. 
1300 status = U_ZERO_ERROR; 1301 tp.setUTF8(status); 1302 TEST_ASSERT_SUCCESS(status); 1303 executeTest(&tp, status); 1304 } 1305 break; 1306 } 1307 1308 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) { 1309 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1310 // Get the code point from the name and insert it into the test data. 1311 // (Damn, no API takes names in Unicode !!! 1312 // we've got to take it back to char *) 1313 int32_t nameEndIdx = testString.indexOf(u'}', charIdx); 1314 int32_t nameLength = nameEndIdx - (charIdx+2); 1315 char charNameBuf[200]; 1316 UChar32 theChar = -1; 1317 if (nameEndIdx != -1) { 1318 UErrorCode status = U_ZERO_ERROR; 1319 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1320 charNameBuf[sizeof(charNameBuf)-1] = 0; 1321 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1322 if (U_FAILURE(status)) { 1323 theChar = -1; 1324 } 1325 } 1326 if (theChar == -1) { 1327 errln("Error in named character in test file at line %d, col %d", 1328 lineNum, column); 1329 } else { 1330 // Named code point was recognized. Insert it 1331 // into the test data. 
1332 tp.dataToBreak.append(theChar); 1333 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1334 tp.srcLine->addElement(lineNum, status); 1335 tp.srcCol ->addElement(column, status); 1336 } 1337 } 1338 if (nameEndIdx > charIdx) { 1339 charIdx = nameEndIdx+1; 1340 1341 } 1342 break; 1343 } 1344 1345 1346 1347 if (testString.compare(charIdx-1, 2, u"<>") == 0) { 1348 charIdx++; 1349 int32_t breakIdx = tp.dataToBreak.length(); 1350 tp.expectedBreaks->setSize(breakIdx+1); 1351 tp.expectedBreaks->setElementAt(-1, breakIdx); 1352 tp.srcLine->setSize(breakIdx+1); 1353 tp.srcLine->setElementAt(lineNum, breakIdx); 1354 tp.srcCol ->setSize(breakIdx+1); 1355 tp.srcCol ->setElementAt(column, breakIdx); 1356 break; 1357 } 1358 1359 if (c == u'<') { 1360 tagValue = 0; 1361 parseState = PARSE_NUM; 1362 break; 1363 } 1364 1365 if (c == u'#' && column==3) { // TODO: why is column off so far? 1366 parseState = PARSE_COMMENT; 1367 savedState = PARSE_DATA; 1368 break; 1369 } 1370 1371 if (c == u'\\') { 1372 // Check for \ at end of line, a line continuation. 1373 // Advance over (discard) the newline 1374 UChar32 cp = testString.char32At(charIdx); 1375 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') { 1376 // We have a CR LF 1377 // Need an extra increment of the input ptr to move over both of them 1378 charIdx++; 1379 } 1380 if (cp == u'\n' || cp == u'\r') { 1381 lineNum++; 1382 colStart = charIdx; 1383 charIdx++; 1384 break; 1385 } 1386 1387 // Let unescape handle the back slash. 1388 cp = testString.unescapeAt(charIdx); 1389 if (cp != -1) { 1390 // Escape sequence was recognized. Insert the char 1391 // into the test data. 1392 tp.dataToBreak.append(cp); 1393 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1394 tp.srcLine->addElement(lineNum, status); 1395 tp.srcCol ->addElement(column, status); 1396 } 1397 break; 1398 } 1399 1400 1401 // Not a recognized backslash escape sequence. 1402 // Take the next char as a literal. 
1403 // TODO: Should this be an error? 1404 c = testString.charAt(charIdx); 1405 charIdx = testString.moveIndex32(charIdx, 1); 1406 } 1407 1408 // Normal, non-escaped data char. 1409 tp.dataToBreak.append(c); 1410 1411 // Save the mapping from offset in the data to line/column numbers in 1412 // the original input file. Will be used for better error messages only. 1413 // If there's an expected break before this char, the slot in the mapping 1414 // vector will already be set for this char; don't overwrite it. 1415 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1416 tp.srcLine->addElement(lineNum, status); 1417 tp.srcCol ->addElement(column, status); 1418 } 1419 break; 1420 1421 1422 case PARSE_NUM: 1423 // We are parsing an expected numeric tag value, like <1234>, 1424 // within a chunk of data. 1425 if (u_isUWhiteSpace(c)) { 1426 break; 1427 } 1428 1429 if (c == u'>') { 1430 // Finished the number. Add the info to the expected break data, 1431 // and switch parse state back to doing plain data. 1432 parseState = PARSE_DATA; 1433 if (tagValue == 0) { 1434 tagValue = -1; 1435 } 1436 int32_t breakIdx = tp.dataToBreak.length(); 1437 tp.expectedBreaks->setSize(breakIdx+1); 1438 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1439 tp.srcLine->setSize(breakIdx+1); 1440 tp.srcLine->setElementAt(lineNum, breakIdx); 1441 tp.srcCol ->setSize(breakIdx+1); 1442 tp.srcCol ->setElementAt(column, breakIdx); 1443 break; 1444 } 1445 1446 if (u_isdigit(c)) { 1447 tagValue = tagValue*10 + u_charDigitValue(c); 1448 break; 1449 } 1450 1451 errln("Syntax Error in test file at line %d, col %d", 1452 lineNum, column); 1453 parseState = PARSE_COMMENT; 1454 goto end_test; // Stop the test 1455 break; 1456 } 1457 1458 1459 if (U_FAILURE(status)) { 1460 errln("ICU Error %s while parsing test file at line %d.", 1461 u_errorName(status), lineNum); 1462 status = U_ZERO_ERROR; 1463 goto end_test; // Stop the test 1464 } 1465 1466 } 1467 1468 // Reached end of test file. 
Raise an error if parseState indicates that we are 1469 // within a block that should have been terminated. 1470 1471 if (parseState == PARSE_RULES) { 1472 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.", 1473 lineNum, rulesFirstLine); 1474 } 1475 if (parseState == PARSE_DATA) { 1476 errln("rbbitst.txt:%d <data> block not closed.", lineNum); 1477 } 1478 1479 1480 end_test: 1481 delete [] testFile; 1482 #endif 1483 } 1484 1485 1486 //------------------------------------------------------------------------------- 1487 // 1488 // TestDictRules create a break iterator from source rules that includes a 1489 // dictionary range. Regression for bug #7130. Source rules 1490 // do not declare a break iterator type (word, line, sentence, etc. 1491 // but the dictionary code, without a type, would loop. 1492 // 1493 //------------------------------------------------------------------------------- 1494 void RBBITest::TestDictRules() { 1495 const char *rules = "$dictionary = [a-z]; \n" 1496 "!!forward; \n" 1497 "$dictionary $dictionary; \n" 1498 "!!reverse; \n" 1499 "$dictionary $dictionary; \n"; 1500 const char *text = "aa"; 1501 UErrorCode status = U_ZERO_ERROR; 1502 UParseError parseError; 1503 1504 RuleBasedBreakIterator bi(rules, parseError, status); 1505 if (U_SUCCESS(status)) { 1506 UnicodeString utext = text; 1507 bi.setText(utext); 1508 int32_t position; 1509 int32_t loops; 1510 for (loops = 0; loops<10; loops++) { 1511 position = bi.next(); 1512 if (position == RuleBasedBreakIterator::DONE) { 1513 break; 1514 } 1515 } 1516 TEST_ASSERT(loops == 1); 1517 } else { 1518 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1519 } 1520 } 1521 1522 1523 1524 //------------------------------------------------------------------------------- 1525 // 1526 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1527 // return the data in one big UChar * buffer, which the caller must delete. 
1528 // 1529 // parameters: 1530 // fileName: the name of the file, with no directory part. The test data directory 1531 // is assumed. 1532 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1533 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1534 // specified here. The BOM, if it exists, will be stripped from the returned data. 1535 // Pass NULL for the system default encoding. 1536 // status 1537 // returns: 1538 // The file data, converted to UChar. 1539 // The caller must delete this when done with 1540 // delete [] theBuffer; 1541 // 1542 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1543 // Move this function to some common place. 1544 // 1545 //-------------------------------------------------------------------------------- 1546 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1547 UChar *retPtr = NULL; 1548 char *fileBuf = NULL; 1549 UConverter* conv = NULL; 1550 FILE *f = NULL; 1551 1552 ulen = 0; 1553 if (U_FAILURE(status)) { 1554 return retPtr; 1555 } 1556 1557 // 1558 // Open the file. 
1559 // 1560 f = fopen(fileName, "rb"); 1561 if (f == 0) { 1562 dataerrln("Error opening test data file %s\n", fileName); 1563 status = U_FILE_ACCESS_ERROR; 1564 return NULL; 1565 } 1566 // 1567 // Read it in 1568 // 1569 int fileSize; 1570 int amt_read; 1571 1572 fseek( f, 0, SEEK_END); 1573 fileSize = ftell(f); 1574 fileBuf = new char[fileSize]; 1575 fseek(f, 0, SEEK_SET); 1576 amt_read = fread(fileBuf, 1, fileSize, f); 1577 if (amt_read != fileSize || fileSize <= 0) { 1578 errln("Error reading test data file."); 1579 goto cleanUpAndReturn; 1580 } 1581 1582 // 1583 // Look for a Unicode Signature (BOM) on the data just read 1584 // 1585 int32_t signatureLength; 1586 const char * fileBufC; 1587 const char* bomEncoding; 1588 1589 fileBufC = fileBuf; 1590 bomEncoding = ucnv_detectUnicodeSignature( 1591 fileBuf, fileSize, &signatureLength, &status); 1592 if(bomEncoding!=NULL ){ 1593 fileBufC += signatureLength; 1594 fileSize -= signatureLength; 1595 encoding = bomEncoding; 1596 } 1597 1598 // 1599 // Open a converter to take the rule file to UTF-16 1600 // 1601 conv = ucnv_open(encoding, &status); 1602 if (U_FAILURE(status)) { 1603 goto cleanUpAndReturn; 1604 } 1605 1606 // 1607 // Convert the rules to UChar. 1608 // Preflight first to determine required buffer size. 1609 // 1610 ulen = ucnv_toUChars(conv, 1611 NULL, // dest, 1612 0, // destCapacity, 1613 fileBufC, 1614 fileSize, 1615 &status); 1616 if (status == U_BUFFER_OVERFLOW_ERROR) { 1617 // Buffer Overflow is expected from the preflight operation. 
1618 status = U_ZERO_ERROR; 1619 1620 retPtr = new UChar[ulen+1]; 1621 ucnv_toUChars(conv, 1622 retPtr, // dest, 1623 ulen+1, 1624 fileBufC, 1625 fileSize, 1626 &status); 1627 } 1628 1629 cleanUpAndReturn: 1630 fclose(f); 1631 delete []fileBuf; 1632 ucnv_close(conv); 1633 if (U_FAILURE(status)) { 1634 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1635 delete []retPtr; 1636 retPtr = 0; 1637 ulen = 0; 1638 }; 1639 return retPtr; 1640 } 1641 1642 1643 1644 //-------------------------------------------------------------------------------------------- 1645 // 1646 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1647 // 1648 //------------------------------------------------------------------------------------------- 1649 void RBBITest::TestUnicodeFiles() { 1650 RuleBasedBreakIterator *bi; 1651 UErrorCode status = U_ZERO_ERROR; 1652 1653 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1654 TEST_ASSERT_SUCCESS(status); 1655 if (U_SUCCESS(status)) { 1656 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1657 } 1658 delete bi; 1659 1660 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1661 TEST_ASSERT_SUCCESS(status); 1662 if (U_SUCCESS(status)) { 1663 runUnicodeTestData("WordBreakTest.txt", bi); 1664 } 1665 delete bi; 1666 1667 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1668 TEST_ASSERT_SUCCESS(status); 1669 if (U_SUCCESS(status)) { 1670 runUnicodeTestData("SentenceBreakTest.txt", bi); 1671 } 1672 delete bi; 1673 1674 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1675 TEST_ASSERT_SUCCESS(status); 1676 if (U_SUCCESS(status)) { 1677 runUnicodeTestData("LineBreakTest.txt", bi); 1678 } 1679 delete bi; 1680 } 1681 1682 1683 // Check for test cases from the Unicode test data files that are known to fail 1684 // 
and should be skipped because ICU is not yet able to fully implement the spec. 1685 // See ticket #7270. 1686 1687 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 1688 static struct TestCase { 1689 const char *fFileName; 1690 const UChar *fString; 1691 } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file. 1692 {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198 1693 {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202 1694 {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214 1695 {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246 1696 {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298 1697 {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302 1698 // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt 1699 {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ 1700 {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG 1701 {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier 1702 1703 // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt 1704 {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK 1705 {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK 1706 }; 1707 1708 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) { 1709 const TestCase &badCase = badTestCases[n]; 1710 if (!strcmp(fileName, badCase.fFileName) && 1711 testCase == UnicodeString(badCase.fString)) { 1712 return logKnownIssue("7270"); 1713 } 1714 } 1715 return FALSE; 1716 } 1717 1718 1719 //-------------------------------------------------------------------------------------------- 1720 // 1721 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1722 // 1723 //------------------------------------------------------------------------------------------- 1724 void RBBITest::runUnicodeTestData(const char *fileName, 
RuleBasedBreakIterator *bi) { 1725 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1726 UErrorCode status = U_ZERO_ERROR; 1727 1728 // 1729 // Open and read the test data file, put it into a UnicodeString. 1730 // 1731 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1732 char testFileName[1000]; 1733 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1734 dataerrln("Can't open test data. Path too long."); 1735 return; 1736 } 1737 strcpy(testFileName, testDataDirectory); 1738 strcat(testFileName, fileName); 1739 1740 logln("Opening data file %s\n", fileName); 1741 1742 int len; 1743 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1744 if (status != U_FILE_ACCESS_ERROR) { 1745 TEST_ASSERT_SUCCESS(status); 1746 TEST_ASSERT(testFile != NULL); 1747 } 1748 if (U_FAILURE(status) || testFile == NULL) { 1749 return; /* something went wrong, error already output */ 1750 } 1751 UnicodeString testFileAsString(TRUE, testFile, len); 1752 1753 // 1754 // Parse the test data file using a regular expression. 1755 // Each kind of token is recognized in its own capture group; what type of item was scanned 1756 // is identified by which group had a match. 1757 // 1758 // Caputure Group # 1 2 3 4 5 1759 // Parses this item: divide x hex digits comment \n unrecognized \n 1760 // 1761 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1762 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1763 UnicodeString testString; 1764 UVector32 breakPositions(status); 1765 int lineNumber = 1; 1766 TEST_ASSERT_SUCCESS(status); 1767 if (U_FAILURE(status)) { 1768 return; 1769 } 1770 1771 // 1772 // Scan through each test case, building up the string to be broken in testString, 1773 // and the positions that should be boundaries in the breakPositions vector. 
1774 // 1775 int spin = 0; 1776 while (tokenMatcher.find()) { 1777 if(tokenMatcher.hitEnd()) { 1778 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1779 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1780 and caused an infinite loop here on EBCDIC systems! 1781 */ 1782 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1783 // return; 1784 } 1785 if (tokenMatcher.start(1, status) >= 0) { 1786 // Scanned a divide sign, indicating a break position in the test data. 1787 if (testString.length()>0) { 1788 breakPositions.addElement(testString.length(), status); 1789 } 1790 } 1791 else if (tokenMatcher.start(2, status) >= 0) { 1792 // Scanned an 'x', meaning no break at this position in the test data 1793 // Nothing to be done here. 1794 } 1795 else if (tokenMatcher.start(3, status) >= 0) { 1796 // Scanned Hex digits. Convert them to binary, append to the character data string. 1797 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1798 int length = hexNumber.length(); 1799 if (length<=8) { 1800 char buf[10]; 1801 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1802 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1803 if (c<=0x10ffff) { 1804 testString.append(c); 1805 } else { 1806 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1807 fileName, lineNumber); 1808 } 1809 } else { 1810 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1811 fileName, lineNumber); 1812 } 1813 } 1814 else if (tokenMatcher.start(4, status) >= 0) { 1815 // Scanned to end of a line, possibly skipping over a comment in the process. 1816 // If the line from the file contained test data, run the test now. 
1817 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1818 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1819 } 1820 1821 // Clear out this test case. 1822 // The string and breakPositions vector will be refilled as the next 1823 // test case is parsed. 1824 testString.remove(); 1825 breakPositions.removeAllElements(); 1826 lineNumber++; 1827 } else { 1828 // Scanner catchall. Something unrecognized appeared on the line. 1829 char token[16]; 1830 UnicodeString uToken = tokenMatcher.group(0, status); 1831 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1832 token[sizeof(token)-1] = 0; 1833 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1834 1835 // Clean up, in preparation for continuing with the next line. 1836 testString.remove(); 1837 breakPositions.removeAllElements(); 1838 lineNumber++; 1839 } 1840 TEST_ASSERT_SUCCESS(status); 1841 if (U_FAILURE(status)) { 1842 break; 1843 } 1844 } 1845 1846 delete [] testFile; 1847 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1848 } 1849 1850 //-------------------------------------------------------------------------------------------- 1851 // 1852 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1853 // test data files. Do only a simple, forward-only check - 1854 // this test is mostly to check that ICU and the Unicode 1855 // data agree with each other. 1856 // 1857 //-------------------------------------------------------------------------------------------- 1858 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1859 const UnicodeString &testString, // Text data to be broken 1860 UVector32 *breakPositions, // Positions where breaks should be found. 1861 RuleBasedBreakIterator *bi) { 1862 int32_t pos; // Break Position in the test string 1863 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
int32_t expectedPos;        // Expected break position (index into test string)

    bi->setText(testString);
    pos = bi->first();
    pos = bi->next();

    while (pos != BreakIterator::DONE) {
        if (expectedI >= breakPositions->size()) {
            errln("Test file \"%s\", line %d, unexpected break found at position %d",
                testFileName, lineNumber, pos);
            break;
        }
        expectedPos = breakPositions->elementAti(expectedI);
        if (pos < expectedPos) {
            errln("Test file \"%s\", line %d, unexpected break found at position %d",
                testFileName, lineNumber, pos);
            break;
        }
        if (pos > expectedPos) {
            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
                testFileName, lineNumber, expectedPos);
            break;
        }
        pos = bi->next();
        expectedI++;
    }

    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
            testFileName, lineNumber, breakPositions->elementAti(expectedI));
    }
}



#if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    virtual ~RBBIMonkeyKind();

    // Error status recorded by a constructor. The constructor below sets it to
    // U_ZERO_ERROR; RBBICharMonkey stores a failing status here, and its next()
    // returns -1 while this holds a failure.
    UErrorCode       deferredStatus;


protected:
    RBBIMonkeyKind();

private:
};

// Start out with no deferred error.
RBBIMonkeyKind::RBBIMonkeyKind() {
    deferredStatus = U_ZERO_ERROR;
}

RBBIMonkeyKind::~RBBIMonkeyKind() {
}


//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance m_seed by one linear-congruential step and return a value in the
// range 0..32767 derived from the upper bits of the new seed.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;
    return (uint32_t)(m_seed/65536) % 32768;
}


//
// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
// The adjacent literals concatenate into a single UnicodeSet pattern, "[ ... ]".
//
static const char16_t *gExtended_Pict = u"["
    "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
    "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
    "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
    "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
    "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
    "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
    "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
    "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
    "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
    "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
    "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
    "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
    "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
    "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
    "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
    "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
    "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
    "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
    "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
    "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
    "]";

//------------------------------------------------------------------------------------------
//
//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    UVector   *fSets;          // The sets handed out by charClasses(). The sets themselves
                               //   are deleted individually by the destructor.

    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;   // Union of the L, V, T, LV and LVT sets.
    UnicodeSet  *fEmojiBaseSet;
    UnicodeSet  *fEmojiModifierSet;
    UnicodeSet  *fExtendedPictSet;
    UnicodeSet  *fEBGSet;
    UnicodeSet  *fEmojiNRKSet;
    UnicodeSet  *fAnySet;      // All code points, 0 .. 0x10ffff.

    const UnicodeString *fText;   // Text under test. Not owned; set by setText().
};


// Build the character class sets from Unicode properties, and collect the ones
//   to be used for test data generation into fSets.
//   Any failure is recorded in deferredStatus rather than reported here.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;

    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
    fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
    fRegionalIndicatorSet =
                  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
    fHangulSet  = new UnicodeSet();
    fHangulSet->addAll(*fLSet);
    fHangulSet->addAll(*fVSet);
    fHangulSet->addAll(*fTSet);
    fHangulSet->addAll(*fLVSet);
    fHangulSet->addAll(*fLVTSet);

    fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
    fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
    fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
    fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
    fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
                "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
    fAnySet           = new UnicodeSet(0, 0x10ffff);

    fSets             = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fRegionalIndicatorSet, status);
    // The Prepend set is only offered as a character class when it is non-empty.
    if (!fPrependSet->isEmpty()) {
        fSets->addElement(fPrependSet, status);
    }
    fSets->addElement(fSpacingSet, status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    fSets->addElement(fEmojiBaseSet, status);
    fSets->addElement(fEmojiModifierSet, status);
    fSets->addElement(fZWJSet,     status);
    fSets->addElement(fExtendedPictSet, status);
    fSets->addElement(fEBGSet,     status);
    fSets->addElement(fEmojiNRKSet,status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text to be tested. Only the address is kept, so the string
//   must outlive subsequent calls to next().
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}



// Find the next grapheme cluster boundary following prevPos, by direct application
//   of the rules below.
//   Returns -1 if deferredStatus is a failure, or if prevPos is at or beyond
//   the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    //   "continue" means no break at this position, keep looking;
    //   "break" means a boundary was found before p2.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x (Extend | ZWJ)
        //    Also remember, in cBase, the character preceding a run of Extend characters;
        //    it is consulted by the GB10 and GB11 checks below.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
        //    First check: no intervening Extend.  Second check: c1 is an Extend,
        //    and the base character is the one saved in cBase.
        if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
            continue;
        }
        if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
            continue;
        }

        // Rule (GB11)   (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
        //    Implemented here in terms of fExtendedPictSet and fEmojiNRKSet.
        if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }
        if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }

        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB999)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}



// Return the vector of character class sets built by the constructor.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


// Delete the vector and every set allocated by the constructor.
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fRegionalIndicatorSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
    delete fEmojiBaseSet;
    delete fEmojiModifierSet;
    delete fZWJSet;
    delete fExtendedPictSet;
    delete fEBGSet;
    delete fEmojiNRKSet;
}

//------------------------------------------------------------------------------------------
//
//   class RBBIWordMonkey      Word Break specific implementation
//                             of RBBIMonkeyKind.
2271 // 2272 //------------------------------------------------------------------------------------------ 2273 class RBBIWordMonkey: public RBBIMonkeyKind { 2274 public: 2275 RBBIWordMonkey(); 2276 virtual ~RBBIWordMonkey(); 2277 virtual UVector *charClasses(); 2278 virtual void setText(const UnicodeString &s); 2279 virtual int32_t next(int32_t i); 2280 private: 2281 UVector *fSets; 2282 2283 UnicodeSet *fCRSet; 2284 UnicodeSet *fLFSet; 2285 UnicodeSet *fNewlineSet; 2286 UnicodeSet *fRegionalIndicatorSet; 2287 UnicodeSet *fKatakanaSet; 2288 UnicodeSet *fHebrew_LetterSet; 2289 UnicodeSet *fALetterSet; 2290 UnicodeSet *fSingle_QuoteSet; 2291 UnicodeSet *fDouble_QuoteSet; 2292 UnicodeSet *fMidNumLetSet; 2293 UnicodeSet *fMidLetterSet; 2294 UnicodeSet *fMidNumSet; 2295 UnicodeSet *fNumericSet; 2296 UnicodeSet *fFormatSet; 2297 UnicodeSet *fOtherSet; 2298 UnicodeSet *fExtendSet; 2299 UnicodeSet *fExtendNumLetSet; 2300 UnicodeSet *fDictionarySet; 2301 UnicodeSet *fEBaseSet; 2302 UnicodeSet *fEBGSet; 2303 UnicodeSet *fEModifierSet; 2304 UnicodeSet *fZWJSet; 2305 UnicodeSet *fExtendedPictSet; 2306 UnicodeSet *fEmojiNRKSet; 2307 2308 const UnicodeString *fText; 2309 }; 2310 2311 2312 RBBIWordMonkey::RBBIWordMonkey() 2313 { 2314 UErrorCode status = U_ZERO_ERROR; 2315 2316 fSets = new UVector(status); 2317 2318 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status); 2319 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status); 2320 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status); 2321 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status); 2322 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status); 2323 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status); 2324 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status); 2325 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status); 2326 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break 
= Double_Quote}]", status); 2327 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status); 2328 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status); 2329 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status); 2330 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status); 2331 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status); 2332 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status); 2333 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status); 2334 2335 fEBaseSet = new UnicodeSet(u"[\\p{Word_Break = EB}]", status); 2336 fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status); 2337 fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status); 2338 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status); 2339 fExtendedPictSet = new UnicodeSet(gExtended_Pict, status); 2340 fEmojiNRKSet = new UnicodeSet( 2341 u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status); 2342 2343 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); 2344 fDictionarySet->addAll(*fKatakanaSet); 2345 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status)); 2346 2347 fALetterSet->removeAll(*fDictionarySet); 2348 2349 fOtherSet = new UnicodeSet(); 2350 if(U_FAILURE(status)) { 2351 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status)); 2352 deferredStatus = status; 2353 return; 2354 } 2355 2356 fOtherSet->complement(); 2357 fOtherSet->removeAll(*fCRSet); 2358 fOtherSet->removeAll(*fLFSet); 2359 fOtherSet->removeAll(*fNewlineSet); 2360 fOtherSet->removeAll(*fKatakanaSet); 2361 fOtherSet->removeAll(*fHebrew_LetterSet); 2362 fOtherSet->removeAll(*fALetterSet); 2363 fOtherSet->removeAll(*fSingle_QuoteSet); 2364 fOtherSet->removeAll(*fDouble_QuoteSet); 2365 fOtherSet->removeAll(*fMidLetterSet); 2366 fOtherSet->removeAll(*fMidNumSet); 2367 
fOtherSet->removeAll(*fNumericSet); 2368 fOtherSet->removeAll(*fExtendNumLetSet); 2369 fOtherSet->removeAll(*fFormatSet); 2370 fOtherSet->removeAll(*fExtendSet); 2371 fOtherSet->removeAll(*fRegionalIndicatorSet); 2372 fOtherSet->removeAll(*fEBaseSet); 2373 fOtherSet->removeAll(*fEBGSet); 2374 fOtherSet->removeAll(*fEModifierSet); 2375 fOtherSet->removeAll(*fZWJSet); 2376 fOtherSet->removeAll(*fExtendedPictSet); 2377 fOtherSet->removeAll(*fEmojiNRKSet); 2378 2379 // Inhibit dictionary characters from being tested at all. 2380 fOtherSet->removeAll(*fDictionarySet); 2381 2382 fSets->addElement(fCRSet, status); 2383 fSets->addElement(fLFSet, status); 2384 fSets->addElement(fNewlineSet, status); 2385 fSets->addElement(fRegionalIndicatorSet, status); 2386 fSets->addElement(fHebrew_LetterSet, status); 2387 fSets->addElement(fALetterSet, status); 2388 fSets->addElement(fSingle_QuoteSet, status); 2389 fSets->addElement(fDouble_QuoteSet, status); 2390 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters 2391 // from the test data. They are all in the dictionary set, 2392 // which this (old, to be retired) monkey test cannot handle. 
2393 fSets->addElement(fMidLetterSet, status); 2394 fSets->addElement(fMidNumLetSet, status); 2395 fSets->addElement(fMidNumSet, status); 2396 fSets->addElement(fNumericSet, status); 2397 fSets->addElement(fFormatSet, status); 2398 fSets->addElement(fExtendSet, status); 2399 fSets->addElement(fOtherSet, status); 2400 fSets->addElement(fExtendNumLetSet, status); 2401 2402 fSets->addElement(fEBaseSet, status); 2403 fSets->addElement(fEBGSet, status); 2404 fSets->addElement(fEModifierSet, status); 2405 fSets->addElement(fZWJSet, status); 2406 fSets->addElement(fExtendedPictSet, status); 2407 fSets->addElement(fEmojiNRKSet, status); 2408 2409 if (U_FAILURE(status)) { 2410 deferredStatus = status; 2411 } 2412 } 2413 2414 void RBBIWordMonkey::setText(const UnicodeString &s) { 2415 fText = &s; 2416 } 2417 2418 2419 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2420 int p0, p1, p2, p3; // Indices of the significant code points around the 2421 // break position being tested. The candidate break 2422 // location is before p2. 2423 2424 int breakPos = -1; 2425 2426 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2427 2428 if (U_FAILURE(deferredStatus)) { 2429 return -1; 2430 } 2431 2432 // Prev break at end of string. return DONE. 2433 if (prevPos >= fText->length()) { 2434 return -1; 2435 } 2436 p0 = p1 = p2 = p3 = prevPos; 2437 c3 = fText->char32At(prevPos); 2438 c0 = c1 = c2 = 0; 2439 (void)p0; // Suppress set but not used warning. 2440 2441 // Loop runs once per "significant" character position in the input text. 2442 for (;;) { 2443 // Move all of the positions forward in the input string. 2444 p0 = p1; c0 = c1; 2445 p1 = p2; c1 = c2; 2446 p2 = p3; c2 = c3; 2447 2448 // Advancd p3 by X(Extend | Format)* Rule 4 2449 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 2450 do { 2451 p3 = fText->moveIndex32(p3, 1); 2452 c3 = fText->char32At(p3); 2453 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2454 break; 2455 }; 2456 } 2457 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3)); 2458 2459 2460 if (p1 == p2) { 2461 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2462 continue; 2463 } 2464 if (p2 == fText->length()) { 2465 // Reached end of string. Always a break position. 2466 break; 2467 } 2468 2469 // Rule (3) CR x LF 2470 // No Extend or Format characters may appear between the CR and LF, 2471 // which requires the additional check for p2 immediately following p1. 2472 // 2473 if (c1==0x0D && c2==0x0A) { 2474 continue; 2475 } 2476 2477 // Rule (3a) Break before and after newlines (including CR and LF) 2478 // 2479 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2480 break; 2481 }; 2482 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2483 break; 2484 }; 2485 2486 // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK). 2487 // Not ignoring extend chars, so peek into input text to 2488 // get the potential ZWJ, the character immediately preceding c2. 2489 // Sloppy UChar32 indexing: p2-1 may reference trail half 2490 // but char32At will get the full code point. 2491 if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { 2492 continue; 2493 } 2494 2495 // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2496 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2497 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2498 continue; 2499 } 2500 2501 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2502 // 2503 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2504 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2505 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2506 continue; 2507 } 2508 2509 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2510 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2511 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2512 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2513 continue; 2514 } 2515 2516 // Rule (7a) Hebrew_Letter x Single_Quote 2517 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2518 continue; 2519 } 2520 2521 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2522 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2523 continue; 2524 } 2525 2526 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2527 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2528 continue; 2529 } 2530 2531 // Rule (8) Numeric x Numeric 2532 if (fNumericSet->contains(c1) && 2533 fNumericSet->contains(c2)) { 2534 continue; 2535 } 2536 2537 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2538 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2539 fNumericSet->contains(c2)) { 2540 continue; 2541 } 2542 2543 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2544 if (fNumericSet->contains(c1) && 2545 
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2546 continue; 2547 } 2548 2549 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2550 if (fNumericSet->contains(c0) && 2551 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2552 fNumericSet->contains(c2)) { 2553 continue; 2554 } 2555 2556 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2557 if (fNumericSet->contains(c1) && 2558 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2559 fNumericSet->contains(c3)) { 2560 continue; 2561 } 2562 2563 // Rule (13) Katakana x Katakana 2564 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 2565 // all Katakana are handled by the dictionary breaker. 2566 if (fKatakanaSet->contains(c1) && 2567 fKatakanaSet->contains(c2)) { 2568 continue; 2569 } 2570 2571 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2572 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2573 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2574 fExtendNumLetSet->contains(c2)) { 2575 continue; 2576 } 2577 2578 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2579 if (fExtendNumLetSet->contains(c1) && 2580 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2581 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2582 continue; 2583 } 2584 2585 // WB 14 (E_Base | EBG) x E_Modifier 2586 if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) { 2587 continue; 2588 } 2589 2590 // Rule 15 - 17 Group pairs of Regional Indicators. 2591 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { 2592 break; 2593 } 2594 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2595 continue; 2596 } 2597 2598 // Rule 999. 
Break found here. 2599 break; 2600 } 2601 2602 breakPos = p2; 2603 return breakPos; 2604 } 2605 2606 2607 UVector *RBBIWordMonkey::charClasses() { 2608 return fSets; 2609 } 2610 2611 2612 RBBIWordMonkey::~RBBIWordMonkey() { 2613 delete fSets; 2614 delete fCRSet; 2615 delete fLFSet; 2616 delete fNewlineSet; 2617 delete fKatakanaSet; 2618 delete fHebrew_LetterSet; 2619 delete fALetterSet; 2620 delete fSingle_QuoteSet; 2621 delete fDouble_QuoteSet; 2622 delete fMidNumLetSet; 2623 delete fMidLetterSet; 2624 delete fMidNumSet; 2625 delete fNumericSet; 2626 delete fFormatSet; 2627 delete fExtendSet; 2628 delete fExtendNumLetSet; 2629 delete fRegionalIndicatorSet; 2630 delete fDictionarySet; 2631 delete fOtherSet; 2632 delete fEBaseSet; 2633 delete fEBGSet; 2634 delete fEModifierSet; 2635 delete fZWJSet; 2636 delete fExtendedPictSet; 2637 delete fEmojiNRKSet; 2638 } 2639 2640 2641 2642 2643 //------------------------------------------------------------------------------------------ 2644 // 2645 // class RBBISentMonkey Sentence Break specific implementation 2646 // of RBBIMonkeyKind. 
2647 // 2648 //------------------------------------------------------------------------------------------ 2649 class RBBISentMonkey: public RBBIMonkeyKind { 2650 public: 2651 RBBISentMonkey(); 2652 virtual ~RBBISentMonkey(); 2653 virtual UVector *charClasses(); 2654 virtual void setText(const UnicodeString &s); 2655 virtual int32_t next(int32_t i); 2656 private: 2657 int moveBack(int posFrom); 2658 int moveForward(int posFrom); 2659 UChar32 cAt(int pos); 2660 2661 UVector *fSets; 2662 2663 UnicodeSet *fSepSet; 2664 UnicodeSet *fFormatSet; 2665 UnicodeSet *fSpSet; 2666 UnicodeSet *fLowerSet; 2667 UnicodeSet *fUpperSet; 2668 UnicodeSet *fOLetterSet; 2669 UnicodeSet *fNumericSet; 2670 UnicodeSet *fATermSet; 2671 UnicodeSet *fSContinueSet; 2672 UnicodeSet *fSTermSet; 2673 UnicodeSet *fCloseSet; 2674 UnicodeSet *fOtherSet; 2675 UnicodeSet *fExtendSet; 2676 2677 const UnicodeString *fText; 2678 2679 }; 2680 2681 RBBISentMonkey::RBBISentMonkey() 2682 { 2683 UErrorCode status = U_ZERO_ERROR; 2684 2685 fSets = new UVector(status); 2686 2687 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2688 // set and made into character classes of their own. For the monkey impl, 2689 // they remain in SEP, since Sep always appears with CR and LF in the rules. 
2690 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2691 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2692 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2693 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2694 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2695 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2696 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2697 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2698 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2699 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2700 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2701 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2702 fOtherSet = new UnicodeSet(); 2703 2704 if(U_FAILURE(status)) { 2705 deferredStatus = status; 2706 return; 2707 } 2708 2709 fOtherSet->complement(); 2710 fOtherSet->removeAll(*fSepSet); 2711 fOtherSet->removeAll(*fFormatSet); 2712 fOtherSet->removeAll(*fSpSet); 2713 fOtherSet->removeAll(*fLowerSet); 2714 fOtherSet->removeAll(*fUpperSet); 2715 fOtherSet->removeAll(*fOLetterSet); 2716 fOtherSet->removeAll(*fNumericSet); 2717 fOtherSet->removeAll(*fATermSet); 2718 fOtherSet->removeAll(*fSContinueSet); 2719 fOtherSet->removeAll(*fSTermSet); 2720 fOtherSet->removeAll(*fCloseSet); 2721 fOtherSet->removeAll(*fExtendSet); 2722 2723 fSets->addElement(fSepSet, status); 2724 fSets->addElement(fFormatSet, status); 2725 fSets->addElement(fSpSet, status); 2726 fSets->addElement(fLowerSet, status); 2727 
fSets->addElement(fUpperSet, status); 2728 fSets->addElement(fOLetterSet, status); 2729 fSets->addElement(fNumericSet, status); 2730 fSets->addElement(fATermSet, status); 2731 fSets->addElement(fSContinueSet, status); 2732 fSets->addElement(fSTermSet, status); 2733 fSets->addElement(fCloseSet, status); 2734 fSets->addElement(fOtherSet, status); 2735 fSets->addElement(fExtendSet, status); 2736 2737 if (U_FAILURE(status)) { 2738 deferredStatus = status; 2739 } 2740 } 2741 2742 2743 2744 void RBBISentMonkey::setText(const UnicodeString &s) { 2745 fText = &s; 2746 } 2747 2748 UVector *RBBISentMonkey::charClasses() { 2749 return fSets; 2750 } 2751 2752 2753 // moveBack() Find the "significant" code point preceding the index i. 2754 // Skips over ($Extend | $Format)* . 2755 // 2756 int RBBISentMonkey::moveBack(int i) { 2757 if (i <= 0) { 2758 return -1; 2759 } 2760 UChar32 c; 2761 int32_t j = i; 2762 do { 2763 j = fText->moveIndex32(j, -1); 2764 c = fText->char32At(j); 2765 } 2766 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2767 return j; 2768 2769 } 2770 2771 2772 int RBBISentMonkey::moveForward(int i) { 2773 if (i>=fText->length()) { 2774 return fText->length(); 2775 } 2776 UChar32 c; 2777 int32_t j = i; 2778 do { 2779 j = fText->moveIndex32(j, 1); 2780 c = cAt(j); 2781 } 2782 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2783 return j; 2784 } 2785 2786 UChar32 RBBISentMonkey::cAt(int pos) { 2787 if (pos<0 || pos>=fText->length()) { 2788 return -1; 2789 } else { 2790 return fText->char32At(pos); 2791 } 2792 } 2793 2794 int32_t RBBISentMonkey::next(int32_t prevPos) { 2795 int p0, p1, p2, p3; // Indices of the significant code points around the 2796 // break position being tested. The candidate break 2797 // location is before p2. 2798 2799 int breakPos = -1; 2800 2801 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 
2802 UChar32 c; 2803 2804 if (U_FAILURE(deferredStatus)) { 2805 return -1; 2806 } 2807 2808 // Prev break at end of string. return DONE. 2809 if (prevPos >= fText->length()) { 2810 return -1; 2811 } 2812 p0 = p1 = p2 = p3 = prevPos; 2813 c3 = fText->char32At(prevPos); 2814 c0 = c1 = c2 = 0; 2815 (void)p0; // Suppress set but not used warning. 2816 2817 // Loop runs once per "significant" character position in the input text. 2818 for (;;) { 2819 // Move all of the positions forward in the input string. 2820 p0 = p1; c0 = c1; 2821 p1 = p2; c1 = c2; 2822 p2 = p3; c2 = c3; 2823 2824 // Advancd p3 by X(Extend | Format)* Rule 4 2825 p3 = moveForward(p3); 2826 c3 = cAt(p3); 2827 2828 // Rule (3) CR x LF 2829 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2830 continue; 2831 } 2832 2833 // Rule (4). Sep <break> 2834 if (fSepSet->contains(c1)) { 2835 p2 = p1+1; // Separators don't combine with Extend or Format. 2836 break; 2837 } 2838 2839 if (p2 >= fText->length()) { 2840 // Reached end of string. Always a break position. 2841 break; 2842 } 2843 2844 if (p2 == prevPos) { 2845 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2846 continue; 2847 } 2848 2849 // Rule (6). ATerm x Numeric 2850 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2851 continue; 2852 } 2853 2854 // Rule (7). (Upper | Lower) ATerm x Uppper 2855 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && 2856 fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2857 continue; 2858 } 2859 2860 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2861 // Note: STerm | ATerm are added to the negated part of the expression by a 2862 // note to the Unicode 5.0 documents. 
2863 int p8 = p1; 2864 while (fSpSet->contains(cAt(p8))) { 2865 p8 = moveBack(p8); 2866 } 2867 while (fCloseSet->contains(cAt(p8))) { 2868 p8 = moveBack(p8); 2869 } 2870 if (fATermSet->contains(cAt(p8))) { 2871 p8=p2; 2872 for (;;) { 2873 c = cAt(p8); 2874 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2875 fLowerSet->contains(c) || fSepSet->contains(c) || 2876 fATermSet->contains(c) || fSTermSet->contains(c)) { 2877 break; 2878 } 2879 p8 = moveForward(p8); 2880 } 2881 if (fLowerSet->contains(cAt(p8))) { 2882 continue; 2883 } 2884 } 2885 2886 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2887 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2888 p8 = p1; 2889 while (fSpSet->contains(cAt(p8))) { 2890 p8 = moveBack(p8); 2891 } 2892 while (fCloseSet->contains(cAt(p8))) { 2893 p8 = moveBack(p8); 2894 } 2895 c = cAt(p8); 2896 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2897 continue; 2898 } 2899 } 2900 2901 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2902 int p9 = p1; 2903 while (fCloseSet->contains(cAt(p9))) { 2904 p9 = moveBack(p9); 2905 } 2906 c = cAt(p9); 2907 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2908 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2909 continue; 2910 } 2911 } 2912 2913 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2914 int p10 = p1; 2915 while (fSpSet->contains(cAt(p10))) { 2916 p10 = moveBack(p10); 2917 } 2918 while (fCloseSet->contains(cAt(p10))) { 2919 p10 = moveBack(p10); 2920 } 2921 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2922 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2923 continue; 2924 } 2925 } 2926 2927 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 2928 int p11 = p1; 2929 if (fSepSet->contains(cAt(p11))) { 2930 p11 = moveBack(p11); 2931 } 2932 while (fSpSet->contains(cAt(p11))) { 2933 p11 = moveBack(p11); 2934 } 2935 while (fCloseSet->contains(cAt(p11))) { 2936 p11 = moveBack(p11); 2937 } 2938 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2939 break; 2940 } 2941 2942 // Rule (12) Any x Any 2943 continue; 2944 } 2945 breakPos = p2; 2946 return breakPos; 2947 } 2948 2949 RBBISentMonkey::~RBBISentMonkey() { 2950 delete fSets; 2951 delete fSepSet; 2952 delete fFormatSet; 2953 delete fSpSet; 2954 delete fLowerSet; 2955 delete fUpperSet; 2956 delete fOLetterSet; 2957 delete fNumericSet; 2958 delete fATermSet; 2959 delete fSContinueSet; 2960 delete fSTermSet; 2961 delete fCloseSet; 2962 delete fOtherSet; 2963 delete fExtendSet; 2964 } 2965 2966 2967 2968 //------------------------------------------------------------------------------------------- 2969 // 2970 // RBBILineMonkey 2971 // 2972 //------------------------------------------------------------------------------------------- 2973 2974 class RBBILineMonkey: public RBBIMonkeyKind { 2975 public: 2976 RBBILineMonkey(); 2977 virtual ~RBBILineMonkey(); 2978 virtual UVector *charClasses(); 2979 virtual void setText(const UnicodeString &s); 2980 virtual int32_t next(int32_t i); 2981 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2982 private: 2983 UVector *fSets; 2984 2985 UnicodeSet *fBK; 2986 UnicodeSet *fCR; 2987 UnicodeSet *fLF; 2988 UnicodeSet *fCM; 2989 UnicodeSet *fNL; 2990 UnicodeSet *fSG; 2991 UnicodeSet *fWJ; 2992 UnicodeSet *fZW; 2993 UnicodeSet *fGL; 2994 UnicodeSet *fCB; 2995 UnicodeSet *fSP; 2996 UnicodeSet *fB2; 2997 UnicodeSet *fBA; 2998 UnicodeSet *fBB; 2999 UnicodeSet *fHY; 3000 UnicodeSet *fH2; 3001 UnicodeSet *fH3; 3002 UnicodeSet *fCL; 3003 UnicodeSet *fCP; 3004 UnicodeSet *fEX; 3005 UnicodeSet *fIN; 3006 UnicodeSet *fJL; 3007 UnicodeSet *fJV; 3008 UnicodeSet *fJT; 
3009 UnicodeSet *fNS; 3010 UnicodeSet *fOP; 3011 UnicodeSet *fQU; 3012 UnicodeSet *fIS; 3013 UnicodeSet *fNU; 3014 UnicodeSet *fPO; 3015 UnicodeSet *fPR; 3016 UnicodeSet *fSY; 3017 UnicodeSet *fAI; 3018 UnicodeSet *fAL; 3019 UnicodeSet *fCJ; 3020 UnicodeSet *fHL; 3021 UnicodeSet *fID; 3022 UnicodeSet *fRI; 3023 UnicodeSet *fXX; 3024 UnicodeSet *fEB; 3025 UnicodeSet *fEM; 3026 UnicodeSet *fZJ; 3027 UnicodeSet *fExtendedPict; 3028 UnicodeSet *fEmojiNRK; 3029 3030 BreakIterator *fCharBI; 3031 const UnicodeString *fText; 3032 RegexMatcher *fNumberMatcher; 3033 }; 3034 3035 RBBILineMonkey::RBBILineMonkey() : 3036 RBBIMonkeyKind(), 3037 fSets(NULL), 3038 3039 fCharBI(NULL), 3040 fText(NULL), 3041 fNumberMatcher(NULL) 3042 3043 { 3044 if (U_FAILURE(deferredStatus)) { 3045 return; 3046 } 3047 3048 UErrorCode status = U_ZERO_ERROR; 3049 3050 fSets = new UVector(status); 3051 3052 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3053 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3054 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3055 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3056 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3057 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3058 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3059 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3060 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3061 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3062 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3063 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3064 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3065 fHY = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3066 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3067 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3068 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 3069 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 3070 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3071 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3072 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3073 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3074 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3075 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3076 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3077 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3078 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3079 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3080 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3081 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3082 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3083 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3084 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3085 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 3086 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 3087 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3088 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 3089 fSG = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3090 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3091 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status); 3092 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status); 3093 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); 3094 fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); 3095 fExtendedPict = new UnicodeSet(gExtended_Pict, status); 3096 3097 if (U_FAILURE(status)) { 3098 deferredStatus = status; 3099 return; 3100 } 3101 3102 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3103 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3104 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3105 3106 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 3107 fCM->addAll(*fZJ); // ZWJ behaves as a CM. 
3108 3109 fSets->addElement(fBK, status); 3110 fSets->addElement(fCR, status); 3111 fSets->addElement(fLF, status); 3112 fSets->addElement(fCM, status); 3113 fSets->addElement(fNL, status); 3114 fSets->addElement(fWJ, status); 3115 fSets->addElement(fZW, status); 3116 fSets->addElement(fGL, status); 3117 fSets->addElement(fCB, status); 3118 fSets->addElement(fSP, status); 3119 fSets->addElement(fB2, status); 3120 fSets->addElement(fBA, status); 3121 fSets->addElement(fBB, status); 3122 fSets->addElement(fHY, status); 3123 fSets->addElement(fH2, status); 3124 fSets->addElement(fH3, status); 3125 fSets->addElement(fCL, status); 3126 fSets->addElement(fCP, status); 3127 fSets->addElement(fEX, status); 3128 fSets->addElement(fIN, status); 3129 fSets->addElement(fJL, status); 3130 fSets->addElement(fJT, status); 3131 fSets->addElement(fJV, status); 3132 fSets->addElement(fNS, status); 3133 fSets->addElement(fOP, status); 3134 fSets->addElement(fQU, status); 3135 fSets->addElement(fIS, status); 3136 fSets->addElement(fNU, status); 3137 fSets->addElement(fPO, status); 3138 fSets->addElement(fPR, status); 3139 fSets->addElement(fSY, status); 3140 fSets->addElement(fAI, status); 3141 fSets->addElement(fAL, status); 3142 fSets->addElement(fHL, status); 3143 fSets->addElement(fID, status); 3144 fSets->addElement(fWJ, status); 3145 fSets->addElement(fRI, status); 3146 fSets->addElement(fSG, status); 3147 fSets->addElement(fEB, status); 3148 fSets->addElement(fEM, status); 3149 fSets->addElement(fZJ, status); 3150 fSets->addElement(fExtendedPict, status); 3151 fSets->addElement(fEmojiNRK, status); 3152 3153 3154 const char *rules = 3155 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?" 3156 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?" 
3157 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*" 3158 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*" 3159 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?" 3160 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"; 3161 3162 fNumberMatcher = new RegexMatcher( 3163 UnicodeString(rules, -1, US_INV), 0, status); 3164 3165 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3166 3167 if (U_FAILURE(status)) { 3168 deferredStatus = status; 3169 } 3170 } 3171 3172 3173 void RBBILineMonkey::setText(const UnicodeString &s) { 3174 fText = &s; 3175 fCharBI->setText(s); 3176 fNumberMatcher->reset(s); 3177 } 3178 3179 // 3180 // rule9Adjust 3181 // Line Break TR rules 9 and 10 implementation. 3182 // This deals with combining marks and other sequences that 3183 // that must be treated as if they were something other than what they actually are. 3184 // 3185 // This is factored out into a separate function because it must be applied twice for 3186 // each potential break, once to the chars before the position being checked, then 3187 // again to the text following the possible break. 3188 // 3189 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3190 if (pos == -1) { 3191 // Invalid initial position. Happens during the warmup iteration of the 3192 // main loop in next(). 3193 return; 3194 } 3195 3196 int32_t nPos = *nextPos; 3197 3198 // LB 9 Keep combining sequences together. 3199 // advance over any CM class chars. Note that Line Break CM is different 3200 // from the normal Grapheme Extend property. 
3201 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3202 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3203 for (;;) { 3204 *nextChar = fText->char32At(nPos); 3205 if (!fCM->contains(*nextChar)) { 3206 break; 3207 } 3208 nPos = fText->moveIndex32(nPos, 1); 3209 } 3210 } 3211 3212 3213 // LB 9 Treat X CM* as if it were x. 3214 // No explicit action required. 3215 3216 // LB 10 Treat any remaining combining mark as AL 3217 if (fCM->contains(*posChar)) { 3218 *posChar = u'A'; 3219 } 3220 3221 // Push the updated nextPos and nextChar back to our caller. 3222 // This only makes a difference if posChar got bigger by consuming a 3223 // combining sequence. 3224 *nextPos = nPos; 3225 *nextChar = fText->char32At(nPos); 3226 } 3227 3228 3229 3230 int32_t RBBILineMonkey::next(int32_t startPos) { 3231 UErrorCode status = U_ZERO_ERROR; 3232 int32_t pos; // Index of the char following a potential break position 3233 UChar32 thisChar; // Character at above position "pos" 3234 3235 int32_t prevPos; // Index of the char preceding a potential break position 3236 UChar32 prevChar; // Character at above position. Note that prevChar 3237 // and thisChar may not be adjacent because combining 3238 // characters between them will be ignored. 3239 3240 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 3241 UChar32 prevCharX2; 3242 3243 int32_t nextPos; // Index of the next character following pos. 3244 // Usually skips over combining marks. 3245 int32_t nextCPPos; // Index of the code point following "pos." 3246 // May point to a combining mark. 3247 int32_t tPos; // temp value. 3248 UChar32 c; 3249 3250 if (U_FAILURE(deferredStatus)) { 3251 return -1; 3252 } 3253 3254 if (startPos >= fText->length()) { 3255 return -1; 3256 } 3257 3258 3259 // Initial values for loop. 
Loop will run the first time without finding breaks, 3260 // while the invalid values shift out and the "this" and 3261 // "prev" positions are filled in with good values. 3262 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 3263 thisChar = prevChar = prevCharX2 = 0; 3264 nextPos = nextCPPos = startPos; 3265 3266 3267 // Loop runs once per position in the test text, until a break position 3268 // is found. 3269 for (;;) { 3270 prevPosX2 = prevPos; 3271 prevCharX2 = prevChar; 3272 3273 prevPos = pos; 3274 prevChar = thisChar; 3275 3276 pos = nextPos; 3277 thisChar = fText->char32At(pos); 3278 3279 nextCPPos = fText->moveIndex32(pos, 1); 3280 nextPos = nextCPPos; 3281 3282 // Rule LB2 - Break at end of text. 3283 if (pos >= fText->length()) { 3284 break; 3285 } 3286 3287 // Rule LB 9 - adjust for combining sequences. 3288 // We do this one out-of-order because the adjustment does not change anything 3289 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3290 // be applied. 3291 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3292 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3293 c = fText->char32At(nextPos); 3294 rule9Adjust(pos, &thisChar, &nextPos, &c); 3295 3296 // If the loop is still warming up - if we haven't shifted the initial 3297 // -1 positions out of prevPos yet - loop back to advance the 3298 // position in the input without any further looking for breaks. 
3299 if (prevPos == -1) { 3300 continue; 3301 } 3302 3303 // LB 4 Always break after hard line breaks, 3304 if (fBK->contains(prevChar)) { 3305 break; 3306 } 3307 3308 // LB 5 Break after CR, LF, NL, but not inside CR LF 3309 if (prevChar == 0x0d && thisChar == 0x0a) { 3310 continue; 3311 } 3312 if (prevChar == 0x0d || 3313 prevChar == 0x0a || 3314 prevChar == 0x85) { 3315 break; 3316 } 3317 3318 // LB 6 Don't break before hard line breaks 3319 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3320 fBK->contains(thisChar)) { 3321 continue; 3322 } 3323 3324 3325 // LB 7 Don't break before spaces or zero-width space. 3326 if (fSP->contains(thisChar)) { 3327 continue; 3328 } 3329 3330 if (fZW->contains(thisChar)) { 3331 continue; 3332 } 3333 3334 // LB 8 Break after zero width space 3335 if (fZW->contains(prevChar)) { 3336 break; 3337 } 3338 3339 // LB 8a ZWJ x (ID | ExtendedPict | Emoji) 3340 // The monkey test's way of ignoring combining characters doesn't work 3341 // for this rule. ZJ is also a CM. Need to get the actual character 3342 // preceding "thisChar", not ignoring combining marks, possibly ZJ. 3343 { 3344 int32_t prevIdx = fText->moveIndex32(pos, -1); 3345 UChar32 prevC = fText->char32At(prevIdx); 3346 if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) { 3347 continue; 3348 } 3349 } 3350 3351 // LB 9, 10 Already done, at top of loop. 3352 // 3353 3354 3355 // LB 11 Do not break before or after WORD JOINER and related characters. 
3356 // x WJ 3357 // WJ x 3358 // 3359 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3360 continue; 3361 } 3362 3363 // LB 12 3364 // GL x 3365 if (fGL->contains(prevChar)) { 3366 continue; 3367 } 3368 3369 // LB 12a 3370 // [^SP BA HY] x GL 3371 if (!(fSP->contains(prevChar) || 3372 fBA->contains(prevChar) || 3373 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3374 continue; 3375 } 3376 3377 3378 3379 // LB 13 Don't break before closings. 3380 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3381 // fall into LB 17 and the more general number regular expression. 3382 // 3383 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3384 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3385 fEX->contains(thisChar) || 3386 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3387 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3388 continue; 3389 } 3390 3391 // LB 14 Don't break after OP SP* 3392 // Scan backwards, checking for this sequence. 3393 // The OP char could include combining marks, so we actually check for 3394 // OP CM* SP* 3395 // Another Twist: The Rule 67 fixes may have changed a SP CM 3396 // sequence into a ID char, so before scanning back through spaces, 3397 // verify that prevChar is indeed a space. 
The prevChar variable 3398 // may differ from fText[prevPos] 3399 tPos = prevPos; 3400 if (fSP->contains(prevChar)) { 3401 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3402 tPos=fText->moveIndex32(tPos, -1); 3403 } 3404 } 3405 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3406 tPos=fText->moveIndex32(tPos, -1); 3407 } 3408 if (fOP->contains(fText->char32At(tPos))) { 3409 continue; 3410 } 3411 3412 3413 // LB 15 QU SP* x OP 3414 if (fOP->contains(thisChar)) { 3415 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3416 int tPos = prevPos; 3417 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3418 tPos = fText->moveIndex32(tPos, -1); 3419 } 3420 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3421 tPos = fText->moveIndex32(tPos, -1); 3422 } 3423 if (fQU->contains(fText->char32At(tPos))) { 3424 continue; 3425 } 3426 } 3427 3428 3429 3430 // LB 16 (CL | CP) SP* x NS 3431 // Scan backwards for SP* CM* (CL | CP) 3432 if (fNS->contains(thisChar)) { 3433 int tPos = prevPos; 3434 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3435 tPos = fText->moveIndex32(tPos, -1); 3436 } 3437 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3438 tPos = fText->moveIndex32(tPos, -1); 3439 } 3440 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3441 continue; 3442 } 3443 } 3444 3445 3446 // LB 17 B2 SP* x B2 3447 if (fB2->contains(thisChar)) { 3448 // Scan backwards, checking for the B2 CM* SP* sequence. 
3449 tPos = prevPos; 3450 if (fSP->contains(prevChar)) { 3451 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3452 tPos=fText->moveIndex32(tPos, -1); 3453 } 3454 } 3455 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3456 tPos=fText->moveIndex32(tPos, -1); 3457 } 3458 if (fB2->contains(fText->char32At(tPos))) { 3459 continue; 3460 } 3461 } 3462 3463 3464 // LB 18 break after space 3465 if (fSP->contains(prevChar)) { 3466 break; 3467 } 3468 3469 // LB 19 3470 // x QU 3471 // QU x 3472 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3473 continue; 3474 } 3475 3476 // LB 20 Break around a CB 3477 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3478 break; 3479 } 3480 3481 // LB 21 3482 if (fBA->contains(thisChar) || 3483 fHY->contains(thisChar) || 3484 fNS->contains(thisChar) || 3485 fBB->contains(prevChar) ) { 3486 continue; 3487 } 3488 3489 // LB 21a 3490 // HL (HY | BA) x 3491 if (fHL->contains(prevCharX2) && 3492 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3493 continue; 3494 } 3495 3496 // LB 21b 3497 // SY x HL 3498 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3499 continue; 3500 } 3501 3502 // LB 22 3503 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3504 (fEX->contains(prevChar) && fIN->contains(thisChar)) || 3505 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3506 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) || 3507 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3508 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3509 continue; 3510 } 3511 3512 3513 // LB 23 (AL | HL) x NU 3514 // NU x (AL | HL) 3515 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) { 3516 continue; 3517 } 3518 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3519 continue; 3520 } 3521 3522 // LB 23a Do not break between numeric prefixes and 
ideographs, or between ideographs and numeric postfixes. 3523 // PR x (ID | EB | EM) 3524 // (ID | EB | EM) x PO 3525 if (fPR->contains(prevChar) && 3526 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { 3527 continue; 3528 } 3529 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && 3530 fPO->contains(thisChar)) { 3531 continue; 3532 } 3533 3534 // LB 24 Do not break between prefix and letters or ideographs. 3535 // (PR | PO) x (AL | HL) 3536 // (AL | HL) x (PR | PO) 3537 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) && 3538 (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3539 continue; 3540 } 3541 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && 3542 (fPR->contains(thisChar) || fPO->contains(thisChar))) { 3543 continue; 3544 } 3545 3546 3547 3548 // LB 25 Numbers 3549 if (fNumberMatcher->lookingAt(prevPos, status)) { 3550 if (U_FAILURE(status)) { 3551 break; 3552 } 3553 // Matched a number. But could have been just a single digit, which would 3554 // not represent a "no break here" between prevChar and thisChar 3555 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3556 if (numEndIdx > pos) { 3557 // Number match includes at least our two chars being checked 3558 if (numEndIdx > nextPos) { 3559 // Number match includes additional chars. Update pos and nextPos 3560 // so that next loop iteration will continue at the end of the number, 3561 // checking for breaks between last char in number & whatever follows. 3562 pos = nextPos = numEndIdx; 3563 do { 3564 pos = fText->moveIndex32(pos, -1); 3565 thisChar = fText->char32At(pos); 3566 } while (fCM->contains(thisChar)); 3567 } 3568 continue; 3569 } 3570 } 3571 3572 3573 // LB 26 Do not break a Korean syllable. 
3574 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3575 fJV->contains(thisChar) || 3576 fH2->contains(thisChar) || 3577 fH3->contains(thisChar))) { 3578 continue; 3579 } 3580 3581 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3582 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3583 continue; 3584 } 3585 3586 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3587 fJT->contains(thisChar)) { 3588 continue; 3589 } 3590 3591 // LB 27 Treat a Korean Syllable Block the same as ID. 3592 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3593 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3594 fIN->contains(thisChar)) { 3595 continue; 3596 } 3597 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3598 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3599 fPO->contains(thisChar)) { 3600 continue; 3601 } 3602 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3603 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3604 continue; 3605 } 3606 3607 3608 3609 // LB 28 Do not break between alphabetics ("at"). 3610 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3611 continue; 3612 } 3613 3614 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3615 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3616 continue; 3617 } 3618 3619 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
3620 // (AL | NU) x OP 3621 // CP x (AL | NU) 3622 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3623 continue; 3624 } 3625 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3626 continue; 3627 } 3628 3629 // LB30a RI RI <break> RI 3630 // RI x RI 3631 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { 3632 break; 3633 } 3634 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3635 continue; 3636 } 3637 3638 // LB30b Emoji Base x Emoji Modifier 3639 if (fEB->contains(prevChar) && fEM->contains(thisChar)) { 3640 continue; 3641 } 3642 3643 // LB 31 Break everywhere else 3644 break; 3645 3646 } 3647 3648 return pos; 3649 } 3650 3651 3652 UVector *RBBILineMonkey::charClasses() { 3653 return fSets; 3654 } 3655 3656 3657 RBBILineMonkey::~RBBILineMonkey() { 3658 delete fSets; 3659 3660 delete fBK; 3661 delete fCR; 3662 delete fLF; 3663 delete fCM; 3664 delete fNL; 3665 delete fWJ; 3666 delete fZW; 3667 delete fGL; 3668 delete fCB; 3669 delete fSP; 3670 delete fB2; 3671 delete fBA; 3672 delete fBB; 3673 delete fHY; 3674 delete fH2; 3675 delete fH3; 3676 delete fCL; 3677 delete fCP; 3678 delete fEX; 3679 delete fIN; 3680 delete fJL; 3681 delete fJV; 3682 delete fJT; 3683 delete fNS; 3684 delete fOP; 3685 delete fQU; 3686 delete fIS; 3687 delete fNU; 3688 delete fPO; 3689 delete fPR; 3690 delete fSY; 3691 delete fAI; 3692 delete fAL; 3693 delete fCJ; 3694 delete fHL; 3695 delete fID; 3696 delete fRI; 3697 delete fSG; 3698 delete fXX; 3699 delete fEB; 3700 delete fEM; 3701 delete fZJ; 3702 delete fExtendedPict; 3703 delete fEmojiNRK; 3704 3705 delete fCharBI; 3706 delete fNumberMatcher; 3707 } 3708 3709 3710 //------------------------------------------------------------------------------------------- 3711 // 3712 // TestMonkey 3713 // 3714 // params 3715 // seed=nnnnn Random number starting seed. 
3716 // Setting the seed allows errors to be reproduced. 3717 // loop=nnn Looping count. Controls running time. 3718 // -1: run forever. 3719 // 0 or greater: run length. 3720 // 3721 // type = char | word | line | sent | title 3722 // 3723 // Example: 3724 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1" 3725 // 3726 //------------------------------------------------------------------------------------------- 3727 3728 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3729 int32_t val = defaultVal; 3730 name.append(" *= *(-?\\d+)"); 3731 UErrorCode status = U_ZERO_ERROR; 3732 RegexMatcher m(name, params, 0, status); 3733 if (m.find()) { 3734 // The param exists. Convert the string to an int. 3735 char valString[100]; 3736 int32_t paramLength = m.end(1, status) - m.start(1, status); 3737 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3738 paramLength = (int32_t)(sizeof(valString)-2); 3739 } 3740 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3741 val = strtol(valString, NULL, 10); 3742 3743 // Delete this parameter from the params string. 
3744 m.reset(); 3745 params = m.replaceFirst("", status); 3746 } 3747 U_ASSERT(U_SUCCESS(status)); 3748 return val; 3749 } 3750 #endif 3751 3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3753 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3754 BreakIterator *bi, 3755 int expected[], 3756 int expectedcount) 3757 { 3758 int count = 0; 3759 int i = 0; 3760 int forward[50]; 3761 bi->setText(ustr); 3762 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3763 forward[count] = i; 3764 if (count < expectedcount && expected[count] != i) { 3765 test->errln("%s:%d break forward test failed: expected %d but got %d", 3766 __FILE__, __LINE__, expected[count], i); 3767 break; 3768 } 3769 count ++; 3770 } 3771 if (count != expectedcount) { 3772 printStringBreaks(ustr, expected, expectedcount); 3773 test->errln("%s:%d break forward test failed: missed %d match", 3774 __FILE__, __LINE__, expectedcount - count); 3775 return; 3776 } 3777 // testing boundaries 3778 for (i = 1; i < expectedcount; i ++) { 3779 int j = expected[i - 1]; 3780 if (!bi->isBoundary(j)) { 3781 printStringBreaks(ustr, expected, expectedcount); 3782 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d", 3783 __FILE__, __LINE__, j); 3784 return; 3785 } 3786 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3787 if (bi->isBoundary(j)) { 3788 printStringBreaks(ustr, expected, expectedcount); 3789 test->errln("%s:%d isBoundary() failed. 
Not expecting boundary at position %d", 3790 __FILE__, __LINE__, j); 3791 return; 3792 } 3793 } 3794 } 3795 3796 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3797 count --; 3798 if (forward[count] != i) { 3799 printStringBreaks(ustr, expected, expectedcount); 3800 test->errln("%s:%d happy break test previous() failed: expected %d but got %d", 3801 __FILE__, __LINE__, forward[count], i); 3802 break; 3803 } 3804 } 3805 if (count != 0) { 3806 printStringBreaks(ustr, expected, expectedcount); 3807 test->errln("break test previous() failed: missed a match"); 3808 return; 3809 } 3810 3811 // testing preceding 3812 for (i = 0; i < expectedcount - 1; i ++) { 3813 // int j = expected[i] + 1; 3814 int j = ustr.moveIndex32(expected[i], 1); 3815 for (; j <= expected[i + 1]; j ++) { 3816 int32_t expectedPreceding = expected[i]; 3817 int32_t actualPreceding = bi->preceding(j); 3818 if (actualPreceding != expectedPreceding) { 3819 printStringBreaks(ustr, expected, expectedcount); 3820 test->errln("%s:%d preceding(%d): expected %d, got %d", 3821 __FILE__, __LINE__, j, expectedPreceding, actualPreceding); 3822 return; 3823 } 3824 } 3825 } 3826 } 3827 #endif 3828 3829 void RBBITest::TestWordBreaks(void) 3830 { 3831 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3832 3833 Locale locale("en"); 3834 UErrorCode status = U_ZERO_ERROR; 3835 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3836 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3837 // Replaced any C+J characters in a row with a random sequence of characters 3838 // of the same length to make our C+J segmentation not get in the way. 
3839 static const char *strlist[] = 3840 { 3841 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3842 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3843 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3844 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3845 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3846 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3847 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3848 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3849 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3850 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3851 "\\u2027\\U000e0067\\u0a47\\u00b7", 3852 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3853 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3854 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3855 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3856 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3857 "\\u0027\\u11af\\U000e0057\\u0602", 3858 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3859 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3860 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3861 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3862 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3863 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3864 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3865 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3866 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3867 "\\u18f4\\U000e0049\\u20e7\\u2027", 3868 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3869 "\\ua183\\u102d\\u0bec\\u003a", 3870 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3871 "\\u003a\\u0e57\\u0fad\\u002e", 3872 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3873 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3874 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3875 
"\\u003a\\u0664\\u00b7\\u1fba", 3876 "\\u003b\\u0027\\u00b7\\u47a3", 3877 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3878 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3879 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3880 }; 3881 int loop; 3882 if (U_FAILURE(status)) { 3883 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3884 return; 3885 } 3886 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3887 // printf("looping %d\n", loop); 3888 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3889 // RBBICharMonkey monkey; 3890 RBBIWordMonkey monkey; 3891 3892 int expected[50]; 3893 int expectedcount = 0; 3894 3895 monkey.setText(ustr); 3896 int i; 3897 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3898 expected[expectedcount ++] = i; 3899 } 3900 3901 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3902 } 3903 delete bi; 3904 #endif 3905 } 3906 3907 void RBBITest::TestWordBoundary(void) 3908 { 3909 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3910 Locale locale("en"); 3911 UErrorCode status = U_ZERO_ERROR; 3912 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3913 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status); 3914 if (U_FAILURE(status)) { 3915 errcheckln(status, "%s:%d Creation of break iterator failed %s", 3916 __FILE__, __LINE__, u_errorName(status)); 3917 return; 3918 } 3919 UChar str[50]; 3920 static const char *strlist[] = 3921 { 3922 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3923 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3924 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3925 "\\u2027\\U000e0067\\u0a47\\u00b7", 3926 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3927 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3928 
"\\u0589\\U000e006e\\u0a42\\U000104a5", 3929 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3930 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3931 "\\u0027\\u11af\\U000e0057\\u0602", 3932 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3933 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3934 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3935 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3936 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3937 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3938 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3939 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3940 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3941 "\\u58f4\\U000e0049\\u20e7\\u2027", 3942 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3943 "\\ua183\\u102d\\u0bec\\u003a", 3944 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3945 "\\u003a\\u0e57\\u0fad\\u002e", 3946 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3947 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3948 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3949 "\\u003a\\u0664\\u00b7\\u1fba", 3950 "\\u003b\\u0027\\u00b7\\u47a3", 3951 }; 3952 int loop; 3953 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3954 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); 3955 UnicodeString ustr(str); 3956 int forward[50]; 3957 int count = 0; 3958 3959 bi->setText(ustr); 3960 int prev = -1; 3961 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) { 3962 ++count; 3963 if (count >= UPRV_LENGTHOF(forward)) { 3964 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)", 3965 __FILE__, __LINE__, loop, count, boundary); 3966 return; 3967 } 3968 forward[count] = boundary; 3969 if (boundary <= prev) { 3970 errln("%s:%d bi::next() did not advance. 
(loop, prev, boundary) = (%d, %d, %d)\n", 3971 __FILE__, __LINE__, loop, prev, boundary); 3972 break; 3973 } 3974 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) { 3975 if (bi->isBoundary(nonBoundary)) { 3976 printStringBreaks(ustr, forward, count); 3977 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)", 3978 __FILE__, __LINE__, loop, prev, nonBoundary, boundary); 3979 return; 3980 } 3981 } 3982 if (!bi->isBoundary(boundary)) { 3983 printStringBreaks(ustr, forward, count); 3984 errln("%s:%d happy boundary test failed: expected %d a boundary", 3985 __FILE__, __LINE__, boundary); 3986 return; 3987 } 3988 prev = boundary; 3989 } 3990 } 3991 } 3992 3993 void RBBITest::TestLineBreaks(void) 3994 { 3995 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3996 Locale locale("en"); 3997 UErrorCode status = U_ZERO_ERROR; 3998 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3999 const int32_t STRSIZE = 50; 4000 UChar str[STRSIZE]; 4001 static const char *strlist[] = 4002 { 4003 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4004 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 4005 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4006 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4007 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4008 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 4009 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4010 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4011 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4012 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4013 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4014 
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4015 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4016 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4017 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4018 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4019 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4020 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4021 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4022 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4023 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4024 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4025 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4026 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4027 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4028 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4029 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4030 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4031 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4032 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4033 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4034 
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4035 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4036 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4037 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4038 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4039 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4040 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4041 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4042 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4043 }; 4044 int loop; 4045 TEST_ASSERT_SUCCESS(status); 4046 if (U_FAILURE(status)) { 4047 return; 4048 } 4049 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 4050 // printf("looping %d\n", loop); 4051 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4052 if (t >= STRSIZE) { 4053 TEST_ASSERT(FALSE); 4054 continue; 4055 } 4056 4057 4058 UnicodeString ustr(str); 4059 RBBILineMonkey monkey; 4060 if (U_FAILURE(monkey.deferredStatus)) { 4061 continue; 4062 } 4063 4064 const int EXPECTEDSIZE = 50; 4065 int expected[EXPECTEDSIZE]; 4066 int expectedcount = 0; 4067 4068 monkey.setText(ustr); 4069 int i; 4070 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4071 if (expectedcount >= EXPECTEDSIZE) { 4072 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4073 return; 4074 } 4075 expected[expectedcount ++] = i; 4076 } 4077 4078 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4079 } 4080 delete bi; 4081 #endif 4082 } 4083 4084 void RBBITest::TestSentBreaks(void) 4085 { 4086 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4087 Locale locale("en"); 4088 UErrorCode status = U_ZERO_ERROR; 4089 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4090 UChar str[200]; 4091 static const char *strlist[] = 4092 { 4093 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4094 "This\n", 4095 "Hello! how are you? I'am fine. Thankyou. How are you doing? 
This\n costs $20,00,000.", 4096 "\"Sentence ending with a quote.\" Bye.", 4097 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4098 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4099 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4100 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4101 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4102 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 4103 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4104 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4105 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4106 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4107 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4108 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4109 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4110 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4111 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4112 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4113 }; 4114 int loop; 4115 if (U_FAILURE(status)) { 4116 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4117 return; 4118 } 4119 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 4120 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); 4121 UnicodeString ustr(str); 4122 4123 RBBISentMonkey monkey; 4124 if (U_FAILURE(monkey.deferredStatus)) { 4125 continue; 4126 } 4127 4128 const int EXPECTEDSIZE = 50; 4129 int expected[EXPECTEDSIZE]; 4130 int expectedcount = 0; 4131 4132 monkey.setText(ustr); 4133 int i; 4134 for (i = 0; i != BreakIterator::DONE; i = 
monkey.next(i)) { 4135 if (expectedcount >= EXPECTEDSIZE) { 4136 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4137 return; 4138 } 4139 expected[expectedcount ++] = i; 4140 } 4141 4142 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4143 } 4144 delete bi; 4145 #endif 4146 } 4147 4148 void RBBITest::TestMonkey() { 4149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4150 4151 UErrorCode status = U_ZERO_ERROR; 4152 int32_t loopCount = 500; 4153 int32_t seed = 1; 4154 UnicodeString breakType = "all"; 4155 Locale locale("en"); 4156 UBool useUText = FALSE; 4157 4158 if (quick == FALSE) { 4159 loopCount = 10000; 4160 } 4161 4162 if (fTestParams) { 4163 UnicodeString p(fTestParams); 4164 loopCount = getIntParam("loop", p, loopCount); 4165 seed = getIntParam("seed", p, seed); 4166 4167 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4168 if (m.find()) { 4169 breakType = m.group(1, status); 4170 m.reset(); 4171 p = m.replaceFirst("", status); 4172 } 4173 4174 RegexMatcher u(" *utext", p, 0, status); 4175 if (u.find()) { 4176 useUText = TRUE; 4177 u.reset(); 4178 p = u.replaceFirst("", status); 4179 } 4180 4181 4182 // m.reset(p); 4183 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4184 // Each option is stripped out of the option string as it is processed. 4185 // All options have been checked. The option string should have been completely emptied.. 
4186 char buf[100]; 4187 p.extract(buf, sizeof(buf), NULL, status); 4188 buf[sizeof(buf)-1] = 0; 4189 errln("Unrecognized or extra parameter: %s\n", buf); 4190 return; 4191 } 4192 4193 } 4194 4195 if (breakType == "char" || breakType == "all") { 4196 RBBICharMonkey m; 4197 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4198 if (U_SUCCESS(status)) { 4199 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4200 if (breakType == "all" && useUText==FALSE) { 4201 // Also run a quick test with UText when "all" is specified 4202 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4203 } 4204 } 4205 else { 4206 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4207 } 4208 delete bi; 4209 } 4210 4211 if (breakType == "word" || breakType == "all") { 4212 logln("Word Break Monkey Test"); 4213 RBBIWordMonkey m; 4214 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4215 if (U_SUCCESS(status)) { 4216 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4217 } 4218 else { 4219 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4220 } 4221 delete bi; 4222 } 4223 4224 if (breakType == "line" || breakType == "all") { 4225 logln("Line Break Monkey Test"); 4226 RBBILineMonkey m; 4227 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4228 if (loopCount >= 10) { 4229 loopCount = loopCount / 5; // Line break runs slower than the others. 
4230 } 4231 if (U_SUCCESS(status)) { 4232 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4233 } 4234 else { 4235 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4236 } 4237 delete bi; 4238 } 4239 4240 if (breakType == "sent" || breakType == "all" ) { 4241 logln("Sentence Break Monkey Test"); 4242 RBBISentMonkey m; 4243 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4244 if (loopCount >= 10) { 4245 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4246 } 4247 if (U_SUCCESS(status)) { 4248 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4249 } 4250 else { 4251 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4252 } 4253 delete bi; 4254 } 4255 4256 #endif 4257 } 4258 4259 // 4260 // Run a RBBI monkey test. Common routine, for all break iterator types. 4261 // Parameters: 4262 // bi - the break iterator to use 4263 // mk - MonkeyKind, abstraction for obtaining expected results 4264 // name - Name of test (char, word, etc.) 
for use in error messages 4265 // seed - Seed for starting random number generator (parameter from user) 4266 // numIterations 4267 // 4268 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4269 int32_t numIterations, UBool useUText) { 4270 4271 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4272 4273 const int32_t TESTSTRINGLEN = 500; 4274 UnicodeString testText; 4275 int32_t numCharClasses; 4276 UVector *chClasses; 4277 int expected[TESTSTRINGLEN*2 + 1]; 4278 int expectedCount = 0; 4279 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4280 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4281 char reverseBreaks[TESTSTRINGLEN*2+1]; 4282 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4283 char followingBreaks[TESTSTRINGLEN*2+1]; 4284 char precedingBreaks[TESTSTRINGLEN*2+1]; 4285 int i; 4286 int loopCount = 0; 4287 4288 m_seed = seed; 4289 4290 numCharClasses = mk.charClasses()->size(); 4291 chClasses = mk.charClasses(); 4292 4293 // Check for errors that occured during the construction of the MonkeyKind object. 4294 // Can't report them where they occured because errln() is a method coming from intlTest, 4295 // and is not visible outside of RBBITest :-( 4296 if (U_FAILURE(mk.deferredStatus)) { 4297 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4298 return; 4299 } 4300 4301 // Verify that the character classes all have at least one member. 4302 for (i=0; i<numCharClasses; i++) { 4303 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4304 if (s == NULL || s->size() == 0) { 4305 errln("Character Class #%d is null or of zero size.", i); 4306 return; 4307 } 4308 } 4309 4310 while (loopCount < numIterations || numIterations == -1) { 4311 if (numIterations == -1 && loopCount % 10 == 0) { 4312 // If test is running in an infinite loop, display a periodic tic so 4313 // we can tell that it is making progress. 
4314 fprintf(stderr, "."); 4315 } 4316 // Save current random number seed, so that we can recreate the random numbers 4317 // for this loop iteration in event of an error. 4318 seed = m_seed; 4319 4320 // Populate a test string with data. 4321 testText.truncate(0); 4322 for (i=0; i<TESTSTRINGLEN; i++) { 4323 int32_t aClassNum = m_rand() % numCharClasses; 4324 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4325 int32_t charIdx = m_rand() % classSet->size(); 4326 UChar32 c = classSet->charAt(charIdx); 4327 if (c < 0) { // TODO: deal with sets containing strings. 4328 errln("%s:%d c < 0", __FILE__, __LINE__); 4329 break; 4330 } 4331 // Do not assemble a supplementary character from randomly generated separate surrogates. 4332 // (It could be a dictionary character) 4333 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) { 4334 continue; 4335 } 4336 4337 testText.append(c); 4338 } 4339 4340 // Calculate the expected results for this test string. 4341 mk.setText(testText); 4342 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4343 expectedBreaks[0] = 1; 4344 int32_t breakPos = 0; 4345 expectedCount = 0; 4346 for (;;) { 4347 breakPos = mk.next(breakPos); 4348 if (breakPos == -1) { 4349 break; 4350 } 4351 if (breakPos > testText.length()) { 4352 errln("breakPos > testText.length()"); 4353 } 4354 expectedBreaks[breakPos] = 1; 4355 U_ASSERT(expectedCount<testText.length()); 4356 expected[expectedCount ++] = breakPos; 4357 (void)expected; // Set but not used warning. 4358 // TODO (andy): check it out. 
4359 } 4360 4361 // Find the break positions using forward iteration 4362 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4363 if (useUText) { 4364 UErrorCode status = U_ZERO_ERROR; 4365 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4366 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4367 bi->setText(testUText, status); 4368 TEST_ASSERT_SUCCESS(status); 4369 utext_close(testUText); // The break iterator does a shallow clone of the UText 4370 // This UText can be closed immediately, so long as the 4371 // testText string continues to exist. 4372 } else { 4373 bi->setText(testText); 4374 } 4375 4376 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4377 if (i < 0 || i > testText.length()) { 4378 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4379 break; 4380 } 4381 forwardBreaks[i] = 1; 4382 } 4383 4384 // Find the break positions using reverse iteration 4385 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4386 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4387 if (i < 0 || i > testText.length()) { 4388 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4389 break; 4390 } 4391 reverseBreaks[i] = 1; 4392 } 4393 4394 // Find the break positions using isBoundary() tests. 4395 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4396 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4397 for (i=0; i<=testText.length(); i++) { 4398 isBoundaryBreaks[i] = bi->isBoundary(i); 4399 } 4400 4401 4402 // Find the break positions using the following() function. 
4403 // printf("."); 4404 memset(followingBreaks, 0, sizeof(followingBreaks)); 4405 int32_t lastBreakPos = 0; 4406 followingBreaks[0] = 1; 4407 for (i=0; i<testText.length(); i++) { 4408 breakPos = bi->following(i); 4409 if (breakPos <= i || 4410 breakPos < lastBreakPos || 4411 breakPos > testText.length() || 4412 (breakPos > lastBreakPos && lastBreakPos > i)) { 4413 errln("%s break monkey test: " 4414 "Out of range value returned by BreakIterator::following().\n" 4415 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4416 name, seed, i, breakPos, lastBreakPos); 4417 break; 4418 } 4419 followingBreaks[breakPos] = 1; 4420 lastBreakPos = breakPos; 4421 } 4422 4423 // Find the break positions using the preceding() function. 4424 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4425 lastBreakPos = testText.length(); 4426 precedingBreaks[testText.length()] = 1; 4427 for (i=testText.length(); i>0; i--) { 4428 breakPos = bi->preceding(i); 4429 if (breakPos >= i || 4430 breakPos > lastBreakPos || 4431 (breakPos < 0 && testText.getChar32Start(i)>0) || 4432 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4433 errln("%s break monkey test: " 4434 "Out of range value returned by BreakIterator::preceding().\n" 4435 "index=%d; prev returned %d; lastBreak=%d" , 4436 name, i, breakPos, lastBreakPos); 4437 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4438 precedingBreaks[i] = 2; // Forces an error. 4439 } 4440 } else { 4441 if (breakPos >= 0) { 4442 precedingBreaks[breakPos] = 1; 4443 } 4444 lastBreakPos = breakPos; 4445 } 4446 } 4447 4448 // Compare the expected and actual results. 
4449 for (i=0; i<=testText.length(); i++) { 4450 const char *errorType = NULL; 4451 if (forwardBreaks[i] != expectedBreaks[i]) { 4452 errorType = "next()"; 4453 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4454 errorType = "previous()"; 4455 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4456 errorType = "isBoundary()"; 4457 } else if (followingBreaks[i] != expectedBreaks[i]) { 4458 errorType = "following()"; 4459 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4460 errorType = "preceding()"; 4461 } 4462 4463 4464 if (errorType != NULL) { 4465 // Format a range of the test text that includes the failure as 4466 // a data item that can be included in the rbbi test data file. 4467 4468 // Start of the range is the last point where expected and actual results 4469 // both agreed that there was a break position. 4470 int startContext = i; 4471 int32_t count = 0; 4472 for (;;) { 4473 if (startContext==0) { break; } 4474 startContext --; 4475 if (expectedBreaks[startContext] != 0) { 4476 if (count == 2) break; 4477 count ++; 4478 } 4479 } 4480 4481 // End of range is two expected breaks past the start position. 4482 int endContext = i + 1; 4483 int ci; 4484 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4485 for (;;) { 4486 if (endContext >= testText.length()) {break;} 4487 if (expectedBreaks[endContext-1] != 0) { 4488 if (count == 0) break; 4489 count --; 4490 } 4491 endContext ++; 4492 } 4493 } 4494 4495 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4496 UnicodeString errorText = "<data>"; 4497 /***if (strcmp(errorType, "next()") == 0) { 4498 startContext = 0; 4499 endContext = testText.length(); 4500 4501 printStringBreaks(testText, expected, expectedCount); 4502 }***/ 4503 4504 for (ci=startContext; ci<endContext;) { 4505 UnicodeString hexChars("0123456789abcdef"); 4506 UChar32 c; 4507 int bn; 4508 c = testText.char32At(ci); 4509 if (ci == i) { 4510 // This is the location of the error. 
4511 errorText.append("<?>"); 4512 } else if (expectedBreaks[ci] != 0) { 4513 // This a non-error expected break position. 4514 errorText.append("\\"); 4515 } 4516 if (c < 0x10000) { 4517 errorText.append("\\u"); 4518 for (bn=12; bn>=0; bn-=4) { 4519 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4520 } 4521 } else { 4522 errorText.append("\\U"); 4523 for (bn=28; bn>=0; bn-=4) { 4524 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4525 } 4526 } 4527 ci = testText.moveIndex32(ci, 1); 4528 } 4529 errorText.append("\\"); 4530 errorText.append("</data>\n"); 4531 4532 // Output the error 4533 char charErrorTxt[500]; 4534 UErrorCode status = U_ZERO_ERROR; 4535 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4536 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4537 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4538 4539 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4540 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4541 errorType, seed, i, charErrorTxt); 4542 break; 4543 } 4544 } 4545 4546 loopCount++; 4547 } 4548 #endif 4549 } 4550 4551 4552 // Bug 5532. UTF-8 based UText fails in dictionary code. 4553 // This test checks the initial patch, 4554 // which is to just keep it from crashing. Correct word boundaries 4555 // await a proper fix to the dictionary code. 4556 // 4557 void RBBITest::TestBug5532(void) { 4558 // Text includes a mixture of Thai and Latin. 
4559 const unsigned char utf8Data[] = { 4560 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4561 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4562 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4563 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4564 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4565 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4566 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4567 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4568 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4569 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4570 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4571 4572 UErrorCode status = U_ZERO_ERROR; 4573 UText utext=UTEXT_INITIALIZER; 4574 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4575 TEST_ASSERT_SUCCESS(status); 4576 4577 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4578 TEST_ASSERT_SUCCESS(status); 4579 if (U_SUCCESS(status)) { 4580 bi->setText(&utext, status); 4581 TEST_ASSERT_SUCCESS(status); 4582 4583 int32_t breakCount = 0; 4584 int32_t previousBreak = -1; 4585 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4586 // For now, just make sure that the break iterator doesn't hang. 
4587 TEST_ASSERT(previousBreak < bi->current()); 4588 previousBreak = bi->current(); 4589 } 4590 TEST_ASSERT(breakCount > 0); 4591 } 4592 delete bi; 4593 utext_close(&utext); 4594 } 4595 4596 4597 void RBBITest::TestBug9983(void) { 4598 UnicodeString text = UnicodeString("\\u002A" // * Other 4599 "\\uFF65" // Other 4600 "\\u309C" // Katakana 4601 "\\uFF9F" // Extend 4602 "\\uFF65" // Other 4603 "\\u0020" // Other 4604 "\\u0000").unescape(); 4605 4606 UErrorCode status = U_ZERO_ERROR; 4607 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4608 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4609 TEST_ASSERT_SUCCESS(status); 4610 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4611 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4612 TEST_ASSERT_SUCCESS(status); 4613 if (U_FAILURE(status)) { 4614 return; 4615 } 4616 int32_t offset, rstatus, iterationCount; 4617 4618 brkiter->setText(text); 4619 brkiter->last(); 4620 iterationCount = 0; 4621 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4622 iterationCount++; 4623 rstatus = brkiter->getRuleStatus(); 4624 (void)rstatus; // Suppress set but not used warning. 4625 if (iterationCount >= 10) { 4626 break; 4627 } 4628 } 4629 TEST_ASSERT(iterationCount == 6); 4630 4631 brkiterPOSIX->setText(text); 4632 brkiterPOSIX->last(); 4633 iterationCount = 0; 4634 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4635 iterationCount++; 4636 rstatus = brkiterPOSIX->getRuleStatus(); 4637 (void)rstatus; // Suppress set but not used warning. 4638 if (iterationCount >= 10) { 4639 break; 4640 } 4641 } 4642 TEST_ASSERT(iterationCount == 6); 4643 } 4644 4645 // Bug 7547 - verify that building a break itereator from empty rules produces an error. 
4646 // 4647 void RBBITest::TestBug7547() { 4648 UnicodeString rules; 4649 UErrorCode status = U_ZERO_ERROR; 4650 UParseError parseError; 4651 RuleBasedBreakIterator breakIterator(rules, parseError, status); 4652 if (status != U_BRK_RULE_SYNTAX) { 4653 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status)); 4654 } 4655 if (parseError.line != 1 || parseError.offset != 0) { 4656 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset); 4657 } 4658 } 4659 4660 4661 void RBBITest::TestBug12797() { 4662 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;"; 4663 UErrorCode status = U_ZERO_ERROR; 4664 UParseError parseError; 4665 RuleBasedBreakIterator bi(rules, parseError, status); 4666 if (U_FAILURE(status)) { 4667 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status)); 4668 return; 4669 } 4670 UnicodeString text = "abc"; 4671 bi.setText(text); 4672 bi.first(); 4673 int32_t boundary = bi.next(); 4674 if (boundary != 3) { 4675 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary); 4676 } 4677 } 4678 4679 void RBBITest::TestBug12918() { 4680 // This test triggers an assertion failure in dictbe.cpp 4681 const UChar *crasherString = u"\u3325\u4a16"; 4682 UErrorCode status = U_ZERO_ERROR; 4683 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status); 4684 if (U_FAILURE(status)) { 4685 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status)); 4686 return; 4687 } 4688 ubrk_first(iter); 4689 int32_t pos = 0; 4690 int32_t lastPos = -1; 4691 while((pos = ubrk_next(iter)) != UBRK_DONE) { 4692 if (pos <= lastPos) { 4693 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos); 4694 break; 4695 } 4696 } 4697 ubrk_close(iter); 4698 } 4699 4700 void RBBITest::TestBug12932() { 4701 // Node Stack overflow in the RBBI rule parser caused a seg fault. 
4702 UnicodeString ruleStr( 4703 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" 4704 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" 4705 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))" 4706 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))" 4707 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))" 4708 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"); 4709 4710 UErrorCode status = U_ZERO_ERROR; 4711 UParseError parseError; 4712 RuleBasedBreakIterator rbbi(ruleStr, parseError, status); 4713 if (status != U_BRK_RULE_SYNTAX) { 4714 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s", 4715 __FILE__, __LINE__, u_errorName(status)); 4716 } 4717 } 4718 4719 4720 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt 4721 // remain undevided by ICU char, word and line break. 4722 void RBBITest::TestEmoji() { 4723 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4724 UErrorCode status = U_ZERO_ERROR; 4725 4726 CharString testFileName; 4727 testFileName.append(IntlTest::getSourceTestData(status), status); 4728 testFileName.appendPathPart("emoji-test.txt", status); 4729 if (U_FAILURE(status)) { 4730 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status)); 4731 return; 4732 } 4733 logln("Opening data file %s\n", testFileName.data()); 4734 4735 int len; 4736 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); 4737 if (U_FAILURE(status) || testFile == NULL) { 4738 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status)); 4739 return; 4740 } 4741 UnicodeString testFileAsString(testFile, len); 4742 delete [] testFile; 4743 4744 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status); 4745 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status); 4746 // hexMatcher group(1) is a 
hex number, or empty string if no hex number present. 4747 int32_t lineNumber = 0; 4748 4749 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status); 4750 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status); 4751 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status); 4752 if (U_FAILURE(status)) { 4753 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status)); 4754 return; 4755 } 4756 4757 while (lineMatcher.find()) { 4758 ++lineNumber; 4759 UnicodeString line = lineMatcher.group(status); 4760 hexMatcher.reset(line); 4761 UnicodeString testString; // accumulates the emoji sequence. 4762 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) { 4763 UnicodeString hex = hexMatcher.group(1, status); 4764 if (hex.length() > 8) { 4765 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)()); 4766 break; 4767 } 4768 CharString hex8; 4769 hex8.appendInvariantChars(hex, status); 4770 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16); 4771 if (c<=0x10ffff) { 4772 testString.append(c); 4773 } else { 4774 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.", 4775 __FILE__, __LINE__, lineNumber, hex8.data()); 4776 break; 4777 } 4778 } 4779 4780 if (testString.length() > 1) { 4781 charBreaks->setText(testString); 4782 charBreaks->first(); 4783 int32_t firstBreak = charBreaks->next(); 4784 if (testString.length() != firstBreak) { 4785 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4786 __FILE__, __LINE__, lineNumber, firstBreak); 4787 } 4788 wordBreaks->setText(testString); 4789 wordBreaks->first(); 4790 firstBreak = wordBreaks->next(); 4791 if (testString.length() != firstBreak) { 4792 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4793 
__FILE__, __LINE__, lineNumber, firstBreak); 4794 } 4795 lineBreaks->setText(testString); 4796 lineBreaks->first(); 4797 firstBreak = lineBreaks->next(); 4798 if (testString.length() != firstBreak) { 4799 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4800 __FILE__, __LINE__, lineNumber, firstBreak); 4801 } 4802 } 4803 } 4804 #endif 4805 } 4806 4807 4808 // TestBug12519 - Correct handling of Locales by assignment / copy / clone 4809 4810 // WHERE Macro yields a literal string of the form "source_file_name:line number " 4811 // TODO: propose something equivalent as a test framework addition. 4812 4813 #define WHERE __FILE__ ":" XLINE(__LINE__) " " 4814 #define XLINE(s) LINE(s) 4815 #define LINE(s) #s 4816 4817 void RBBITest::TestBug12519() { 4818 UErrorCode status = U_ZERO_ERROR; 4819 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status)); 4820 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status)); 4821 if (!assertSuccess(WHERE, status)) { 4822 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status)); 4823 return; 4824 } 4825 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status)); 4826 4827 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); 4828 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr); 4829 4830 LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone()); 4831 assertTrue(WHERE, *biEn == *cloneEn); 4832 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status)); 4833 4834 LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone()); 4835 assertTrue(WHERE, *biFr == *cloneFr); 4836 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status)); 4837 4838 
LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status)); 4839 UnicodeString text("Hallo Welt"); 4840 biDe->setText(text); 4841 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe); 4842 *biDe = *biFr; 4843 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe); 4844 } 4845 4846 // 4847 // TestDebug - A place-holder test for debugging purposes. 4848 // For putting in fragments of other tests that can be invoked 4849 // for tracing without a lot of unwanted extra stuff happening. 4850 // 4851 void RBBITest::TestDebug(void) { 4852 } 4853 4854 void RBBITest::TestProperties() { 4855 UErrorCode errorCode = U_ZERO_ERROR; 4856 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4857 if (!prependSet.isEmpty()) { 4858 errln( 4859 "[:GCB=Prepend:] is not empty any more. " 4860 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4861 "change this test to the opposite condition."); 4862 } 4863 } 4864 4865 #endif // #if !UCONFIG_NO_BREAK_ITERATION 4866