1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "utypeinfo.h" // for 'typeid' to work 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/utypes.h" 19 #include "unicode/brkiter.h" 20 #include "unicode/rbbi.h" 21 #include "unicode/uchar.h" 22 #include "unicode/utf16.h" 23 #include "unicode/ucnv.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uniset.h" 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 #include "unicode/regex.h" 28 #endif 29 #include "unicode/ustring.h" 30 #include "unicode/utext.h" 31 #include "intltest.h" 32 #include "rbbitst.h" 33 #include <string.h> 34 #include "charstr.h" 35 #include "uvector.h" 36 #include "uvectr32.h" 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include "unicode/numfmt.h" 40 #include "unicode/uscript.h" 41 #include "cmemory.h" 42 43 #define TEST_ASSERT(x) {if (!(x)) { \ 44 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 45 46 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 47 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 48 49 50 //--------------------------------------------- 51 // runIndexedTest 52 //--------------------------------------------- 53 54 55 // Note: Before adding new tests to this file, check whether the desired test data can 56 // simply be added to the file testdata/rbbitest.txt. In most cases it can, 57 // it's much less work than writing a new test, diagnostic output in the event of failures 58 // is good, and the test data file will is shared with ICU4J, so eventually the test 59 // will run there as well, without additional effort. 60 61 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 62 { 63 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 64 65 switch (index) { 66 #if !UCONFIG_NO_FILE_IO 67 case 0: name = "TestBug4153072"; 68 if(exec) TestBug4153072(); break; 69 #else 70 case 0: name = "skip"; 71 break; 72 #endif 73 74 case 1: name = "skip"; 75 break; 76 case 2: name = "TestStatusReturn"; 77 if(exec) TestStatusReturn(); break; 78 79 #if !UCONFIG_NO_FILE_IO 80 case 3: name = "TestUnicodeFiles"; 81 if(exec) TestUnicodeFiles(); break; 82 case 4: name = "TestEmptyString"; 83 if(exec) TestEmptyString(); break; 84 #else 85 case 3: case 4: name = "skip"; 86 break; 87 #endif 88 89 case 5: name = "TestGetAvailableLocales"; 90 if(exec) TestGetAvailableLocales(); break; 91 92 case 6: name = "TestGetDisplayName"; 93 if(exec) TestGetDisplayName(); break; 94 95 #if !UCONFIG_NO_FILE_IO 96 case 7: name = "TestEndBehaviour"; 97 if(exec) TestEndBehaviour(); break; 98 case 8: case 9: case 10: name = "skip"; 99 break; 100 case 11: name = "TestWordBreaks"; 101 if(exec) TestWordBreaks(); break; 102 case 12: name = "TestWordBoundary"; 103 if(exec) TestWordBoundary(); break; 104 case 13: name = "TestLineBreaks"; 105 if(exec) TestLineBreaks(); break; 106 case 14: name = "TestSentBreaks"; 107 if(exec) TestSentBreaks(); break; 108 case 15: name = "TestExtended"; 109 if(exec) TestExtended(); break; 110 #else 111 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 112 break; 113 #endif 114 115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 116 case 16: 117 name = "TestMonkey"; if(exec) TestMonkey(params); break; 118 #else 119 case 16: 120 name = "skip"; break; 121 #endif 122 123 #if !UCONFIG_NO_FILE_IO 124 case 17: name = "TestBug3818"; 125 if(exec) TestBug3818(); break; 126 #else 127 case 17: name = "skip"; 128 break; 129 #endif 130 131 case 18: name = "skip"; 132 break; 133 case 19: name = "TestDebug"; 134 if(exec) TestDebug(); break; 135 case 20: name = "skip"; 136 break; 137 138 #if !UCONFIG_NO_FILE_IO 139 case 21: name = "TestBug5775"; 140 if (exec) TestBug5775(); break; 141 #else 142 case 21: name = "skip"; 143 break; 144 #endif 145 146 case 22: name = "TestBug9983"; 147 if (exec) TestBug9983(); break; 148 case 23: name = "TestDictRules"; 149 if (exec) TestDictRules(); break; 150 case 24: name = "TestBug5532"; 151 if (exec) TestBug5532(); break; 152 default: name = ""; break; //needed to end loop 153 } 154 } 155 156 157 //--------------------------------------------------------------------------- 158 // 159 // class BITestData Holds a set of Break iterator test data and results 160 // Includes 161 // - the string data to be broken 162 // - a vector of the expected break positions. 163 // - a vector of source line numbers for the data, 164 // (to help see where errors occured.) 165 // - The expected break tag values. 166 // - Vectors of actual break positions and tag values. 167 // - Functions for comparing actual with expected and 168 // reporting errors. 169 // 170 //---------------------------------------------------------------------------- 171 class BITestData { 172 public: 173 UnicodeString fDataToBreak; 174 UVector fExpectedBreakPositions; 175 UVector fExpectedTags; 176 UVector fLineNum; 177 UVector fActualBreakPositions; // Test Results. 178 UVector fActualTags; 179 180 BITestData(UErrorCode &status); 181 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 182 void checkResults(const char *heading, RBBITest *test); 183 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 184 void clearResults(); 185 }; 186 187 // 188 // Constructor. 189 // 190 BITestData::BITestData(UErrorCode &status) 191 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 192 fActualTags(status) 193 { 194 } 195 196 // 197 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 198 // The macro form collects the line number, which is helpful 199 // when tracking down failures. 200 // 201 // A null data item is inserted at the start of each test's data 202 // to put the starting zero into the data list. The position saved for 203 // each non-null item is its ending position. 204 // 205 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 206 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 207 if (U_FAILURE(status)) {return;} 208 if (data != NULL) { 209 fDataToBreak.append(CharsToUnicodeString(data)); 210 } 211 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 212 fExpectedTags.addElement(tag, status); 213 fLineNum.addElement(lineNum, status); 214 } 215 216 217 // 218 // checkResults. Compare the actual and expected break positions, report any differences. 219 // 220 void BITestData::checkResults(const char *heading, RBBITest *test) { 221 int32_t expectedIndex = 0; 222 int32_t actualIndex = 0; 223 224 for (;;) { 225 // If we've run through both the expected and actual results vectors, we're done. 226 // break out of the loop. 227 if (expectedIndex >= fExpectedBreakPositions.size() && 228 actualIndex >= fActualBreakPositions.size()) { 229 break; 230 } 231 232 233 if (expectedIndex >= fExpectedBreakPositions.size()) { 234 err(heading, test, expectedIndex-1, actualIndex); 235 actualIndex++; 236 continue; 237 } 238 239 if (actualIndex >= fActualBreakPositions.size()) { 240 err(heading, test, expectedIndex, actualIndex-1); 241 expectedIndex++; 242 continue; 243 } 244 245 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 246 err(heading, test, expectedIndex, actualIndex); 247 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 248 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 249 actualIndex++; 250 } else { 251 expectedIndex++; 252 } 253 continue; 254 } 255 256 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 257 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 258 heading, fLineNum.elementAt(expectedIndex), 259 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 260 } 261 262 actualIndex++; 263 expectedIndex++; 264 } 265 } 266 267 // 268 // err - An error was found. Report it, along with information about where the 269 // incorrectly broken test data appeared in the source file. 270 // 271 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 272 { 273 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 274 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 275 int32_t o = 0; 276 int32_t line = fLineNum.elementAti(expectedIdx); 277 if (expectedIdx > 0) { 278 // The line numbers are off by one because a premature break occurs somewhere 279 // within the previous item, rather than at the start of the current (expected) item. 280 // We want to report the offset of the unexpected break from the start of 281 // this previous item. 282 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 283 } 284 if (actual < expected) { 285 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 286 } else { 287 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 288 } 289 } 290 291 292 void BITestData::clearResults() { 293 fActualBreakPositions.removeAllElements(); 294 fActualTags.removeAllElements(); 295 } 296 297 298 //-------------------------------------------------------------------------------------- 299 // 300 // RBBITest constructor and destructor 301 // 302 //-------------------------------------------------------------------------------------- 303 304 RBBITest::RBBITest() { 305 } 306 307 308 RBBITest::~RBBITest() { 309 } 310 311 //----------------------------------------------------------------------------------- 312 // 313 // Test for status {tag} return value from break rules. 314 // TODO: a more thorough test. 315 // 316 //----------------------------------------------------------------------------------- 317 void RBBITest::TestStatusReturn() { 318 UnicodeString rulesString1("$Letters = [:L:];\n" 319 "$Numbers = [:N:];\n" 320 "$Letters+{1};\n" 321 "$Numbers+{2};\n" 322 "Help\\ {4}/me\\!;\n" 323 "[^$Letters $Numbers];\n" 324 "!.*;\n", -1, US_INV); 325 UnicodeString testString1 = "abc123..abc Help me Help me!"; 326 // 01234567890123456789012345678 327 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 328 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 329 330 UErrorCode status=U_ZERO_ERROR; 331 UParseError parseError; 332 333 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 334 if(U_FAILURE(status)) { 335 dataerrln("FAIL : in construction - %s", u_errorName(status)); 336 } else { 337 int32_t pos; 338 int32_t i = 0; 339 bi->setText(testString1); 340 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 341 if (pos != bounds1[i]) { 342 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 343 break; 344 } 345 346 int tag = bi->getRuleStatus(); 347 if (tag != brkStatus[i]) { 348 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 349 break; 350 } 351 i++; 352 } 353 } 354 delete bi; 355 } 356 357 358 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 359 UErrorCode status = U_ZERO_ERROR; 360 char name[100]; 361 printf("code alpha extend alphanum type word sent line name\n"); 362 int nextExpectedIndex = 0; 363 utext_setNativeIndex(tstr, 0); 364 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 365 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 366 printf("------------------------------------------------ %d\n", j); 367 ++nextExpectedIndex; 368 } 369 370 UChar32 c = utext_next32(tstr); 371 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 372 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, 373 u_isUAlphabetic(c), 374 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 375 u_isalnum(c), 376 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 377 u_charType(c), 378 U_SHORT_PROPERTY_NAME), 379 u_getPropertyValueName(UCHAR_WORD_BREAK, 380 u_getIntPropertyValue(c, 381 UCHAR_WORD_BREAK), 382 U_SHORT_PROPERTY_NAME), 383 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 384 u_getIntPropertyValue(c, 385 UCHAR_SENTENCE_BREAK), 386 U_SHORT_PROPERTY_NAME), 387 u_getPropertyValueName(UCHAR_LINE_BREAK, 388 u_getIntPropertyValue(c, 389 UCHAR_LINE_BREAK), 390 U_SHORT_PROPERTY_NAME), 391 name); 392 } 393 } 394 395 396 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 397 UErrorCode status = U_ZERO_ERROR; 398 UText *tstr = NULL; 399 tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 400 if (U_FAILURE(status)) { 401 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status)); 402 return; 403 } 404 printStringBreaks(tstr, expected, expectedCount); 405 utext_close(tstr); 406 } 407 408 409 void RBBITest::TestBug3818() { 410 UErrorCode status = U_ZERO_ERROR; 411 412 // Four Thai words... 413 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 414 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 415 UnicodeString thaiStr(thaiWordData); 416 417 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 418 if (U_FAILURE(status) || bi == NULL) { 419 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 420 return; 421 } 422 bi->setText(thaiStr); 423 424 int32_t startOfSecondWord = bi->following(1); 425 if (startOfSecondWord != 4) { 426 errln("Fail at file %s, line %d expected start of word at 4, got %d", 427 __FILE__, __LINE__, startOfSecondWord); 428 } 429 startOfSecondWord = bi->following(0); 430 if (startOfSecondWord != 4) { 431 errln("Fail at file %s, line %d expected start of word at 4, got %d", 432 __FILE__, __LINE__, startOfSecondWord); 433 } 434 delete bi; 435 } 436 437 //---------------------------------------------------------------------------- 438 // 439 // generalIteratorTest Given a break iterator and a set of test data, 440 // Run the tests and report the results. 441 // 442 //---------------------------------------------------------------------------- 443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 444 { 445 446 bi.setText(td.fDataToBreak); 447 448 testFirstAndNext(bi, td); 449 450 testLastAndPrevious(bi, td); 451 452 testFollowing(bi, td); 453 testPreceding(bi, td); 454 testIsBoundary(bi, td); 455 doMultipleSelectionTest(bi, td); 456 } 457 458 459 // 460 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 461 // kind of loop. 462 // 463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 464 { 465 UErrorCode status = U_ZERO_ERROR; 466 int32_t p; 467 int32_t lastP = -1; 468 int32_t tag; 469 470 logln("Test first and next"); 471 bi.setText(td.fDataToBreak); 472 td.clearResults(); 473 474 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 475 td.fActualBreakPositions.addElement(p, status); // Save result. 476 tag = bi.getRuleStatus(); 477 td.fActualTags.addElement(tag, status); 478 if (p <= lastP) { 479 // If the iterator is not making forward progress, stop. 480 // No need to raise an error here, it'll be detected in the normal check of results. 481 break; 482 } 483 lastP = p; 484 } 485 td.checkResults("testFirstAndNext", this); 486 } 487 488 489 // 490 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 491 // 492 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 493 { 494 UErrorCode status = U_ZERO_ERROR; 495 int32_t p; 496 int32_t lastP = 0x7ffffffe; 497 int32_t tag; 498 499 logln("Test last and previous"); 500 bi.setText(td.fDataToBreak); 501 td.clearResults(); 502 503 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 504 // Save break position. Insert it at start of vector of results, shoving 505 // already-saved results further towards the end. 506 td.fActualBreakPositions.insertElementAt(p, 0, status); 507 // bi.previous(); // TODO: Why does this fix things up???? 508 // bi.next(); 509 tag = bi.getRuleStatus(); 510 td.fActualTags.insertElementAt(tag, 0, status); 511 if (p >= lastP) { 512 // If the iterator is not making progress, stop. 513 // No need to raise an error here, it'll be detected in the normal check of results. 514 break; 515 } 516 lastP = p; 517 } 518 td.checkResults("testLastAndPrevious", this); 519 } 520 521 522 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 523 { 524 UErrorCode status = U_ZERO_ERROR; 525 int32_t p; 526 int32_t tag; 527 int32_t lastP = -2; // A value that will never be returned as a break position. 528 // cannot be -1; that is returned for DONE. 529 int i; 530 531 logln("testFollowing():"); 532 bi.setText(td.fDataToBreak); 533 td.clearResults(); 534 535 // Save the starting point, since we won't get that out of following. 536 p = bi.first(); 537 td.fActualBreakPositions.addElement(p, status); // Save result. 538 tag = bi.getRuleStatus(); 539 td.fActualTags.addElement(tag, status); 540 541 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 542 p = bi.following(i); 543 if (p != lastP) { 544 if (p == RuleBasedBreakIterator::DONE) { 545 break; 546 } 547 // We've reached a new break position. Save it. 548 td.fActualBreakPositions.addElement(p, status); // Save result. 549 tag = bi.getRuleStatus(); 550 td.fActualTags.addElement(tag, status); 551 lastP = p; 552 } 553 } 554 // The loop normally exits by means of the break in the middle. 555 // Make sure that the index was at the correct position for the break iterator to have 556 // returned DONE. 557 if (i != td.fDataToBreak.length()) { 558 errln("testFollowing(): iterator returned DONE prematurely."); 559 } 560 561 // Full check of all results. 562 td.checkResults("testFollowing", this); 563 } 564 565 566 567 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 568 UErrorCode status = U_ZERO_ERROR; 569 int32_t p; 570 int32_t tag; 571 int32_t lastP = 0x7ffffffe; 572 int i; 573 574 logln("testPreceding():"); 575 bi.setText(td.fDataToBreak); 576 td.clearResults(); 577 578 p = bi.last(); 579 td.fActualBreakPositions.addElement(p, status); 580 tag = bi.getRuleStatus(); 581 td.fActualTags.addElement(tag, status); 582 583 for (i = td.fDataToBreak.length(); i>=-1; i--) { 584 p = bi.preceding(i); 585 if (p != lastP) { 586 if (p == RuleBasedBreakIterator::DONE) { 587 break; 588 } 589 // We've reached a new break position. Save it. 590 td.fActualBreakPositions.insertElementAt(p, 0, status); 591 lastP = p; 592 tag = bi.getRuleStatus(); 593 td.fActualTags.insertElementAt(tag, 0, status); 594 } 595 } 596 // The loop normally exits by means of the break in the middle. 597 // Make sure that the index was at the correct position for the break iterator to have 598 // returned DONE. 599 if (i != 0) { 600 errln("testPreceding(): iterator returned DONE prematurely."); 601 } 602 603 // Full check of all results. 604 td.checkResults("testPreceding", this); 605 } 606 607 608 609 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 610 UErrorCode status = U_ZERO_ERROR; 611 int i; 612 int32_t tag; 613 614 logln("testIsBoundary():"); 615 bi.setText(td.fDataToBreak); 616 td.clearResults(); 617 618 for (i = 0; i <= td.fDataToBreak.length(); i++) { 619 if (bi.isBoundary(i)) { 620 td.fActualBreakPositions.addElement(i, status); // Save result. 621 tag = bi.getRuleStatus(); 622 td.fActualTags.addElement(tag, status); 623 } 624 } 625 td.checkResults("testIsBoundary: ", this); 626 } 627 628 629 630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 631 { 632 iterator.setText(td.fDataToBreak); 633 634 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 635 int32_t offset = iterator.first(); 636 int32_t testOffset; 637 int32_t count = 0; 638 639 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 640 641 if (*testIterator != iterator) 642 errln("clone() or operator!= failed: two clones compared unequal"); 643 644 do { 645 testOffset = testIterator->first(); 646 testOffset = testIterator->next(count); 647 if (offset != testOffset) 648 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 649 650 if (offset != RuleBasedBreakIterator::DONE) { 651 count++; 652 offset = iterator.next(); 653 654 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 655 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 656 if (count > 10000 || offset == -1) { 657 errln("operator== failed too many times. Stopping test."); 658 if (offset == -1) { 659 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 660 } 661 return; 662 } 663 } 664 } 665 } while (offset != RuleBasedBreakIterator::DONE); 666 667 // now do it backwards... 668 offset = iterator.last(); 669 count = 0; 670 671 do { 672 testOffset = testIterator->last(); 673 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 674 if (offset != testOffset) 675 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 676 677 if (offset != RuleBasedBreakIterator::DONE) { 678 count--; 679 offset = iterator.previous(); 680 } 681 } while (offset != RuleBasedBreakIterator::DONE); 682 683 delete testIterator; 684 } 685 686 687 //--------------------------------------------- 688 // 689 // other tests 690 // 691 //--------------------------------------------- 692 void RBBITest::TestEmptyString() 693 { 694 UnicodeString text = ""; 695 UErrorCode status = U_ZERO_ERROR; 696 697 BITestData x(status); 698 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 699 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 700 if (U_FAILURE(status)) 701 { 702 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 703 return; 704 } 705 generalIteratorTest(*bi, x); 706 delete bi; 707 } 708 709 void RBBITest::TestGetAvailableLocales() 710 { 711 int32_t locCount = 0; 712 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 713 714 if (locCount == 0) 715 dataerrln("getAvailableLocales() returned an empty list!"); 716 // Just make sure that it's returning good memory. 717 int32_t i; 718 for (i = 0; i < locCount; ++i) { 719 logln(locList[i].getName()); 720 } 721 } 722 723 //Testing the BreakIterator::getDisplayName() function 724 void RBBITest::TestGetDisplayName() 725 { 726 UnicodeString result; 727 728 BreakIterator::getDisplayName(Locale::getUS(), result); 729 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 730 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 731 + result); 732 733 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 734 if (result != "French (France)") 735 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 736 + result); 737 } 738 /** 739 * Test End Behaviour 740 * @bug 4068137 741 */ 742 void RBBITest::TestEndBehaviour() 743 { 744 UErrorCode status = U_ZERO_ERROR; 745 UnicodeString testString("boo."); 746 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 747 if (U_FAILURE(status)) 748 { 749 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status)); 750 return; 751 } 752 wb->setText(testString); 753 754 if (wb->first() != 0) 755 errln("Didn't get break at beginning of string."); 756 if (wb->next() != 3) 757 errln("Didn't get break before period in \"boo.\""); 758 if (wb->current() != 4 && wb->next() != 4) 759 errln("Didn't get break at end of string."); 760 delete wb; 761 } 762 /* 763 * @bug 4153072 764 */ 765 void RBBITest::TestBug4153072() { 766 UErrorCode status = U_ZERO_ERROR; 767 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 768 if (U_FAILURE(status)) 769 { 770 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 771 return; 772 } 773 UnicodeString str("...Hello, World!..."); 774 int32_t begin = 3; 775 int32_t end = str.length() - 3; 776 UBool onBoundary; 777 778 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 779 iter->adoptText(textIterator); 780 int index; 781 // Note: with the switch to UText, there is no way to restrict the 782 // iteration range to begin at an index other than zero. 783 // String character iterators created with a non-zero bound are 784 // treated by RBBI as being empty. 785 for (index = -1; index < begin + 1; ++index) { 786 onBoundary = iter->isBoundary(index); 787 if (index == 0? !onBoundary : onBoundary) { 788 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 789 " and begin index = " + begin); 790 } 791 } 792 delete iter; 793 } 794 795 796 // 797 // Test for problem reported by Ashok Matoria on 9 July 2007 798 // One.<kSoftHyphen><kSpace>Two. 799 // 800 // Sentence break at start (0) and then on calling next() it breaks at 801 // 'T' of "Two". Now, at this point if I do next() and 802 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 803 // 804 void RBBITest::TestBug5775() { 805 UErrorCode status = U_ZERO_ERROR; 806 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 807 TEST_ASSERT_SUCCESS(status); 808 if (U_FAILURE(status)) { 809 return; 810 } 811 // Check for status first for better handling of no data errors. 812 TEST_ASSERT(bi != NULL); 813 if (bi == NULL) { 814 return; 815 } 816 817 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 818 // 01234 56789 819 s = s.unescape(); 820 bi->setText(s); 821 int pos = bi->next(); 822 TEST_ASSERT(pos == 6); 823 pos = bi->next(); 824 TEST_ASSERT(pos == 10); 825 pos = bi->previous(); 826 TEST_ASSERT(pos == 6); 827 delete bi; 828 } 829 830 831 832 //------------------------------------------------------------------------------ 833 // 834 // RBBITest::Extended Run RBBI Tests from an external test data file 835 // 836 //------------------------------------------------------------------------------ 837 838 struct TestParams { 839 BreakIterator *bi; // Break iterator is set while parsing test source. 840 // Changed out whenever test data changes break type. 841 842 UnicodeString dataToBreak; // Data that is built up while parsing the test. 843 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 844 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 845 UVector32 *srcCol; 846 847 UText *textToBreak; // UText, could be UTF8 or UTF16. 848 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 849 CharString utf8String; // UTF-8 form of text to break. 850 851 TestParams(UErrorCode &status) : dataToBreak() { 852 bi = NULL; 853 expectedBreaks = new UVector32(status); 854 srcLine = new UVector32(status); 855 srcCol = new UVector32(status); 856 textToBreak = NULL; 857 textMap = new UVector32(status); 858 } 859 860 ~TestParams() { 861 delete bi; 862 delete expectedBreaks; 863 delete srcLine; 864 delete srcCol; 865 utext_close(textToBreak); 866 delete textMap; 867 } 868 869 int32_t getSrcLine(int32_t bp); 870 int32_t getExpectedBreak(int32_t bp); 871 int32_t getSrcCol(int32_t bp); 872 873 void setUTF16(UErrorCode &status); 874 void setUTF8(UErrorCode &status); 875 }; 876 877 // Append a UnicodeString to a CharString with UTF-8 encoding. 878 // Substitute any invalid chars. 879 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 880 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 881 if (U_FAILURE(status)) { 882 return; 883 } 884 int32_t utf8Length; 885 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 886 src.getBuffer(), src.length(), // UTF-16 data 887 0xfffd, NULL, // Substitution char, number of subs. 888 &status); 889 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 890 return; 891 } 892 status = U_ZERO_ERROR; 893 int32_t capacity; 894 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 895 u_strToUTF8WithSub(buffer, utf8Length, NULL, 896 src.getBuffer(), src.length(), 897 0xfffd, NULL, &status); 898 dest.append(buffer, utf8Length, status); 899 } 900 901 902 void TestParams::setUTF16(UErrorCode &status) { 903 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 904 textMap->removeAllElements(); 905 for (int32_t i=0; i<dataToBreak.length(); i++) { 906 if (i == dataToBreak.getChar32Start(i)) { 907 textMap->addElement(i, status); 908 } else { 909 textMap->addElement(-1, status); 910 } 911 } 912 textMap->addElement(dataToBreak.length(), status); 913 U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 914 } 915 916 917 void TestParams::setUTF8(UErrorCode &status) { 918 if (U_FAILURE(status)) { 919 return; 920 } 921 utf8String.clear(); 922 CharStringAppend(utf8String, dataToBreak, status); 923 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 924 if (U_FAILURE(status)) { 925 return; 926 } 927 928 textMap->removeAllElements(); 929 int32_t utf16Index = 0; 930 for (;;) { 931 textMap->addElement(utf16Index, status); 932 UChar32 c32 = utext_current32(textToBreak); 933 if (c32 < 0) { 934 break; 935 } 936 utf16Index += U16_LENGTH(c32); 937 utext_next32(textToBreak); 938 while (textMap->size() < utext_getNativeIndex(textToBreak)) { 939 textMap->addElement(-1, status); 940 } 941 } 942 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 943 } 944 945 946 int32_t TestParams::getSrcLine(int bp) { 947 if (bp >= textMap->size()) { 948 bp = textMap->size() - 1; 949 } 950 int32_t i = 0; 951 for(; bp >= 0 ; --bp) { 952 // Move to a character boundary if we are not on one already. 953 i = textMap->elementAti(bp); 954 if (i >= 0) { 955 break; 956 } 957 } 958 return srcLine->elementAti(i); 959 } 960 961 962 int32_t TestParams::getExpectedBreak(int bp) { 963 if (bp >= textMap->size()) { 964 return 0; 965 } 966 int32_t i = textMap->elementAti(bp); 967 int32_t retVal = 0; 968 if (i >= 0) { 969 retVal = expectedBreaks->elementAti(i); 970 } 971 return retVal; 972 } 973 974 975 int32_t TestParams::getSrcCol(int bp) { 976 if (bp >= textMap->size()) { 977 bp = textMap->size() - 1; 978 } 979 int32_t i = 0; 980 for(; bp >= 0; --bp) { 981 // Move bp to a character boundary if we are not on one already. 982 i = textMap->elementAti(bp); 983 if (i >= 0) { 984 break; 985 } 986 } 987 return srcCol->elementAti(i); 988 } 989 990 991 void RBBITest::executeTest(TestParams *t, UErrorCode &status) { 992 int32_t bp; 993 int32_t prevBP; 994 int32_t i; 995 996 TEST_ASSERT_SUCCESS(status); 997 if (U_FAILURE(status)) { 998 return; 999 } 1000 1001 if (t->bi == NULL) { 1002 return; 1003 } 1004 1005 t->bi->setText(t->textToBreak, status); 1006 // 1007 // Run the iterator forward 1008 // 1009 prevBP = -1; 1010 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1011 if (prevBP == bp) { 1012 // Fail for lack of forward progress. 1013 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1014 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1015 break; 1016 } 1017 1018 // Check that there we didn't miss an expected break between the last one 1019 // and this one. 1020 for (i=prevBP+1; i<bp; i++) { 1021 if (t->getExpectedBreak(i) != 0) { 1022 int expected[] = {0, i}; 1023 printStringBreaks(t->dataToBreak, expected, 2); 1024 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1025 i, t->getSrcLine(i), t->getSrcCol(i)); 1026 } 1027 } 1028 1029 // Check that the break we did find was expected 1030 if (t->getExpectedBreak(bp) == 0) { 1031 int expected[] = {0, bp}; 1032 printStringBreaks(t->textToBreak, expected, 2); 1033 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1034 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1035 } else { 1036 // The break was expected. 1037 // Check that the {nnn} tag value is correct. 1038 int32_t expectedTagVal = t->getExpectedBreak(bp); 1039 if (expectedTagVal == -1) { 1040 expectedTagVal = 0; 1041 } 1042 int32_t line = t->getSrcLine(bp); 1043 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1044 if (rs != expectedTagVal) { 1045 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1046 " Actual, Expected status = %4d, %4d", 1047 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1048 } 1049 } 1050 1051 prevBP = bp; 1052 } 1053 1054 // Verify that there were no missed expected breaks after the last one found 1055 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 1056 if (t->getExpectedBreak(i) != 0) { 1057 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1058 i, t->getSrcLine(i), t->getSrcCol(i)); 1059 } 1060 } 1061 1062 // 1063 // Run the iterator backwards, verify that the same breaks are found. 1064 // 1065 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 1066 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1067 if (prevBP == bp) { 1068 // Fail for lack of progress. 1069 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 1070 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1071 break; 1072 } 1073 1074 // Check that we didn't miss an expected break between the last one 1075 // and this one. (UVector returns zeros for index out of bounds.) 1076 for (i=prevBP-1; i>bp; i--) { 1077 if (t->getExpectedBreak(i) != 0) { 1078 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1079 i, t->getSrcLine(i), t->getSrcCol(i)); 1080 } 1081 } 1082 1083 // Check that the break we did find was expected 1084 if (t->getExpectedBreak(bp) == 0) { 1085 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1086 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1087 } else { 1088 // The break was expected. 1089 // Check that the {nnn} tag value is correct. 1090 int32_t expectedTagVal = t->getExpectedBreak(bp); 1091 if (expectedTagVal == -1) { 1092 expectedTagVal = 0; 1093 } 1094 int line = t->getSrcLine(bp); 1095 int32_t rs = t->bi->getRuleStatus(); 1096 if (rs != expectedTagVal) { 1097 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1098 " Actual, Expected status = %4d, %4d", 1099 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1100 } 1101 } 1102 1103 prevBP = bp; 1104 } 1105 1106 // Verify that there were no missed breaks prior to the last one found 1107 for (i=prevBP-1; i>=0; i--) { 1108 if (t->getExpectedBreak(i) != 0) { 1109 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1110 i, t->getSrcLine(i), t->getSrcCol(i)); 1111 } 1112 } 1113 1114 // Check isBoundary() 1115 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1116 UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 1117 UBool boundaryFound = t->bi->isBoundary(i); 1118 if (boundaryExpected != boundaryFound) { 1119 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 1120 " Expected, Actual= %s, %s", 1121 i, t->getSrcLine(i), t->getSrcCol(i), 1122 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 1123 } 1124 } 1125 1126 // Check following() 1127 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1128 int32_t actualBreak = t->bi->following(i); 1129 int32_t expectedBreak = BreakIterator::DONE; 1130 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 1131 if (t->getExpectedBreak(j) != 0) { 1132 expectedBreak = j; 1133 break; 1134 } 1135 } 1136 if (expectedBreak != actualBreak) { 1137 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 1138 " Expected, Actual= %d, %d", 1139 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1140 } 1141 } 1142 1143 // Check preceding() 1144 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 1145 int32_t actualBreak = t->bi->preceding(i); 1146 int32_t expectedBreak = BreakIterator::DONE; 1147 1148 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 1149 // preceding(trailing byte) will return the index of some preceding code point, 1150 // not the lead byte of the current code point, even though that has a smaller index. 1151 // Therefore, start looking at the expected break data not at i-1, but at 1152 // the start of code point index - 1. 1153 utext_setNativeIndex(t->textToBreak, i); 1154 int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 1155 for (; j >= 0; j--) { 1156 if (t->getExpectedBreak(j) != 0) { 1157 expectedBreak = j; 1158 break; 1159 } 1160 } 1161 if (expectedBreak != actualBreak) { 1162 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 1163 " Expected, Actual= %d, %d", 1164 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1165 } 1166 } 1167 } 1168 1169 1170 void RBBITest::TestExtended() { 1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1172 UErrorCode status = U_ZERO_ERROR; 1173 Locale locale(""); 1174 1175 UnicodeString rules; 1176 TestParams tp(status); 1177 1178 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status); 1179 if (U_FAILURE(status)) { 1180 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1181 } 1182 1183 1184 // 1185 // Open and read the test data file. 1186 // 1187 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1188 char testFileName[1000]; 1189 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1190 errln("Can't open test data. Path too long."); 1191 return; 1192 } 1193 strcpy(testFileName, testDataDirectory); 1194 strcat(testFileName, "rbbitst.txt"); 1195 1196 int len; 1197 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1198 if (U_FAILURE(status)) { 1199 return; /* something went wrong, error already output */ 1200 } 1201 1202 1203 1204 1205 // 1206 // Put the test data into a UnicodeString 1207 // 1208 UnicodeString testString(FALSE, testFile, len); 1209 1210 enum EParseState{ 1211 PARSE_COMMENT, 1212 PARSE_TAG, 1213 PARSE_DATA, 1214 PARSE_NUM 1215 } 1216 parseState = PARSE_TAG; 1217 1218 EParseState savedState = PARSE_TAG; 1219 1220 static const UChar CH_LF = 0x0a; 1221 static const UChar CH_CR = 0x0d; 1222 static const UChar CH_HASH = 0x23; 1223 /*static const UChar CH_PERIOD = 0x2e;*/ 1224 static const UChar CH_LT = 0x3c; 1225 static const UChar CH_GT = 0x3e; 1226 static const UChar CH_BACKSLASH = 0x5c; 1227 static const UChar CH_BULLET = 0x2022; 1228 1229 int32_t lineNum = 1; 1230 int32_t colStart = 0; 1231 int32_t column = 0; 1232 int32_t charIdx = 0; 1233 1234 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 1235 1236 for (charIdx = 0; charIdx < len; ) { 1237 status = U_ZERO_ERROR; 1238 UChar c = testString.charAt(charIdx); 1239 charIdx++; 1240 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1241 // treat CRLF as a unit 1242 c = CH_LF; 1243 charIdx++; 1244 } 1245 if (c == CH_LF || c == CH_CR) { 1246 lineNum++; 1247 colStart = charIdx; 1248 } 1249 column = charIdx - colStart + 1; 1250 1251 switch (parseState) { 1252 case PARSE_COMMENT: 1253 if (c == 0x0a || c == 0x0d) { 1254 parseState = savedState; 1255 } 1256 break; 1257 1258 case PARSE_TAG: 1259 { 1260 if (c == CH_HASH) { 1261 parseState = PARSE_COMMENT; 1262 savedState = PARSE_TAG; 1263 break; 1264 } 1265 if (u_isUWhiteSpace(c)) { 1266 break; 1267 } 1268 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1269 delete tp.bi; 1270 tp.bi = BreakIterator::createWordInstance(locale, status); 1271 charIdx += 5; 1272 break; 1273 } 1274 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1275 delete tp.bi; 1276 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1277 charIdx += 5; 1278 break; 1279 } 1280 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1281 delete tp.bi; 1282 tp.bi = BreakIterator::createLineInstance(locale, status); 1283 charIdx += 5; 1284 break; 1285 } 1286 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1287 delete tp.bi; 1288 tp.bi = NULL; 1289 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1290 charIdx += 5; 1291 break; 1292 } 1293 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1294 delete tp.bi; 1295 tp.bi = BreakIterator::createTitleInstance(locale, status); 1296 charIdx += 6; 1297 break; 1298 } 1299 1300 // <locale loc_name> 1301 localeMatcher.reset(testString); 1302 if (localeMatcher.lookingAt(charIdx-1, status)) { 1303 UnicodeString localeName = localeMatcher.group(1, status); 1304 char localeName8[100]; 1305 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1306 locale = Locale::createFromName(localeName8); 1307 charIdx += localeMatcher.group(0, status).length() - 1; 1308 TEST_ASSERT_SUCCESS(status); 1309 break; 1310 } 1311 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1312 parseState = PARSE_DATA; 1313 charIdx += 5; 1314 tp.dataToBreak = ""; 1315 tp.expectedBreaks->removeAllElements(); 1316 tp.srcCol ->removeAllElements(); 1317 tp.srcLine->removeAllElements(); 1318 break; 1319 } 1320 1321 errln("line %d: Tag expected in test file.", lineNum); 1322 parseState = PARSE_COMMENT; 1323 savedState = PARSE_DATA; 1324 goto end_test; // Stop the test. 1325 } 1326 break; 1327 1328 case PARSE_DATA: 1329 if (c == CH_BULLET) { 1330 int32_t breakIdx = tp.dataToBreak.length(); 1331 tp.expectedBreaks->setSize(breakIdx+1); 1332 tp.expectedBreaks->setElementAt(-1, breakIdx); 1333 tp.srcLine->setSize(breakIdx+1); 1334 tp.srcLine->setElementAt(lineNum, breakIdx); 1335 tp.srcCol ->setSize(breakIdx+1); 1336 tp.srcCol ->setElementAt(column, breakIdx); 1337 break; 1338 } 1339 1340 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1341 // Add final entry to mappings from break location to source file position. 1342 // Need one extra because last break position returned is after the 1343 // last char in the data, not at the last char. 1344 tp.srcLine->addElement(lineNum, status); 1345 tp.srcCol ->addElement(column, status); 1346 1347 parseState = PARSE_TAG; 1348 charIdx += 6; 1349 1350 // RUN THE TEST! 1351 status = U_ZERO_ERROR; 1352 tp.setUTF16(status); 1353 executeTest(&tp, status); 1354 TEST_ASSERT_SUCCESS(status); 1355 1356 // Run again, this time with UTF-8 text wrapped in a UText. 1357 status = U_ZERO_ERROR; 1358 tp.setUTF8(status); 1359 TEST_ASSERT_SUCCESS(status); 1360 executeTest(&tp, status); 1361 break; 1362 } 1363 1364 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1365 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1366 // Get the code point from the name and insert it into the test data. 1367 // (Damn, no API takes names in Unicode !!! 1368 // we've got to take it back to char *) 1369 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1370 int32_t nameLength = nameEndIdx - (charIdx+2); 1371 char charNameBuf[200]; 1372 UChar32 theChar = -1; 1373 if (nameEndIdx != -1) { 1374 UErrorCode status = U_ZERO_ERROR; 1375 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1376 charNameBuf[sizeof(charNameBuf)-1] = 0; 1377 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1378 if (U_FAILURE(status)) { 1379 theChar = -1; 1380 } 1381 } 1382 if (theChar == -1) { 1383 errln("Error in named character in test file at line %d, col %d", 1384 lineNum, column); 1385 } else { 1386 // Named code point was recognized. Insert it 1387 // into the test data. 1388 tp.dataToBreak.append(theChar); 1389 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1390 tp.srcLine->addElement(lineNum, status); 1391 tp.srcCol ->addElement(column, status); 1392 } 1393 } 1394 if (nameEndIdx > charIdx) { 1395 charIdx = nameEndIdx+1; 1396 1397 } 1398 break; 1399 } 1400 1401 1402 1403 1404 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1405 charIdx++; 1406 int32_t breakIdx = tp.dataToBreak.length(); 1407 tp.expectedBreaks->setSize(breakIdx+1); 1408 tp.expectedBreaks->setElementAt(-1, breakIdx); 1409 tp.srcLine->setSize(breakIdx+1); 1410 tp.srcLine->setElementAt(lineNum, breakIdx); 1411 tp.srcCol ->setSize(breakIdx+1); 1412 tp.srcCol ->setElementAt(column, breakIdx); 1413 break; 1414 } 1415 1416 if (c == CH_LT) { 1417 tagValue = 0; 1418 parseState = PARSE_NUM; 1419 break; 1420 } 1421 1422 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1423 parseState = PARSE_COMMENT; 1424 savedState = PARSE_DATA; 1425 break; 1426 } 1427 1428 if (c == CH_BACKSLASH) { 1429 // Check for \ at end of line, a line continuation. 1430 // Advance over (discard) the newline 1431 UChar32 cp = testString.char32At(charIdx); 1432 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1433 // We have a CR LF 1434 // Need an extra increment of the input ptr to move over both of them 1435 charIdx++; 1436 } 1437 if (cp == CH_LF || cp == CH_CR) { 1438 lineNum++; 1439 colStart = charIdx; 1440 charIdx++; 1441 break; 1442 } 1443 1444 // Let unescape handle the back slash. 1445 cp = testString.unescapeAt(charIdx); 1446 if (cp != -1) { 1447 // Escape sequence was recognized. Insert the char 1448 // into the test data. 1449 tp.dataToBreak.append(cp); 1450 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1451 tp.srcLine->addElement(lineNum, status); 1452 tp.srcCol ->addElement(column, status); 1453 } 1454 break; 1455 } 1456 1457 1458 // Not a recognized backslash escape sequence. 1459 // Take the next char as a literal. 1460 // TODO: Should this be an error? 1461 c = testString.charAt(charIdx); 1462 charIdx = testString.moveIndex32(charIdx, 1); 1463 } 1464 1465 // Normal, non-escaped data char. 1466 tp.dataToBreak.append(c); 1467 1468 // Save the mapping from offset in the data to line/column numbers in 1469 // the original input file. Will be used for better error messages only. 1470 // If there's an expected break before this char, the slot in the mapping 1471 // vector will already be set for this char; don't overwrite it. 1472 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1473 tp.srcLine->addElement(lineNum, status); 1474 tp.srcCol ->addElement(column, status); 1475 } 1476 break; 1477 1478 1479 case PARSE_NUM: 1480 // We are parsing an expected numeric tag value, like <1234>, 1481 // within a chunk of data. 1482 if (u_isUWhiteSpace(c)) { 1483 break; 1484 } 1485 1486 if (c == CH_GT) { 1487 // Finished the number. Add the info to the expected break data, 1488 // and switch parse state back to doing plain data. 1489 parseState = PARSE_DATA; 1490 if (tagValue == 0) { 1491 tagValue = -1; 1492 } 1493 int32_t breakIdx = tp.dataToBreak.length(); 1494 tp.expectedBreaks->setSize(breakIdx+1); 1495 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1496 tp.srcLine->setSize(breakIdx+1); 1497 tp.srcLine->setElementAt(lineNum, breakIdx); 1498 tp.srcCol ->setSize(breakIdx+1); 1499 tp.srcCol ->setElementAt(column, breakIdx); 1500 break; 1501 } 1502 1503 if (u_isdigit(c)) { 1504 tagValue = tagValue*10 + u_charDigitValue(c); 1505 break; 1506 } 1507 1508 errln("Syntax Error in test file at line %d, col %d", 1509 lineNum, column); 1510 parseState = PARSE_COMMENT; 1511 goto end_test; // Stop the test 1512 break; 1513 } 1514 1515 1516 if (U_FAILURE(status)) { 1517 dataerrln("ICU Error %s while parsing test file at line %d.", 1518 u_errorName(status), lineNum); 1519 status = U_ZERO_ERROR; 1520 goto end_test; // Stop the test 1521 } 1522 1523 } 1524 1525 end_test: 1526 delete [] testFile; 1527 #endif 1528 } 1529 1530 1531 //------------------------------------------------------------------------------- 1532 // 1533 // TestDictRules create a break iterator from source rules that includes a 1534 // dictionary range. Regression for bug #7130. Source rules 1535 // do not declare a break iterator type (word, line, sentence, etc. 1536 // but the dictionary code, without a type, would loop. 1537 // 1538 //------------------------------------------------------------------------------- 1539 void RBBITest::TestDictRules() { 1540 const char *rules = "$dictionary = [a-z]; \n" 1541 "!!forward; \n" 1542 "$dictionary $dictionary; \n" 1543 "!!reverse; \n" 1544 "$dictionary $dictionary; \n"; 1545 const char *text = "aa"; 1546 UErrorCode status = U_ZERO_ERROR; 1547 UParseError parseError; 1548 1549 RuleBasedBreakIterator bi(rules, parseError, status); 1550 if (U_SUCCESS(status)) { 1551 UnicodeString utext = text; 1552 bi.setText(utext); 1553 int32_t position; 1554 int32_t loops; 1555 for (loops = 0; loops<10; loops++) { 1556 position = bi.next(); 1557 if (position == RuleBasedBreakIterator::DONE) { 1558 break; 1559 } 1560 } 1561 TEST_ASSERT(loops == 1); 1562 } else { 1563 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1564 } 1565 } 1566 1567 1568 1569 //------------------------------------------------------------------------------- 1570 // 1571 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1572 // return the datain one big UChar * buffer, which the caller must delete. 1573 // 1574 // parameters: 1575 // fileName: the name of the file, with no directory part. The test data directory 1576 // is assumed. 1577 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1578 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1579 // specified here. The BOM, if it exists, will be stripped from the returned data. 1580 // Pass NULL for the system default encoding. 1581 // status 1582 // returns: 1583 // The file data, converted to UChar. 1584 // The caller must delete this when done with 1585 // delete [] theBuffer; 1586 // 1587 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1588 // Move this function to some common place. 1589 // 1590 //-------------------------------------------------------------------------------- 1591 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1592 UChar *retPtr = NULL; 1593 char *fileBuf = NULL; 1594 UConverter* conv = NULL; 1595 FILE *f = NULL; 1596 1597 ulen = 0; 1598 if (U_FAILURE(status)) { 1599 return retPtr; 1600 } 1601 1602 // 1603 // Open the file. 1604 // 1605 f = fopen(fileName, "rb"); 1606 if (f == 0) { 1607 dataerrln("Error opening test data file %s\n", fileName); 1608 status = U_FILE_ACCESS_ERROR; 1609 return NULL; 1610 } 1611 // 1612 // Read it in 1613 // 1614 int fileSize; 1615 int amt_read; 1616 1617 fseek( f, 0, SEEK_END); 1618 fileSize = ftell(f); 1619 fileBuf = new char[fileSize]; 1620 fseek(f, 0, SEEK_SET); 1621 amt_read = fread(fileBuf, 1, fileSize, f); 1622 if (amt_read != fileSize || fileSize <= 0) { 1623 errln("Error reading test data file."); 1624 goto cleanUpAndReturn; 1625 } 1626 1627 // 1628 // Look for a Unicode Signature (BOM) on the data just read 1629 // 1630 int32_t signatureLength; 1631 const char * fileBufC; 1632 const char* bomEncoding; 1633 1634 fileBufC = fileBuf; 1635 bomEncoding = ucnv_detectUnicodeSignature( 1636 fileBuf, fileSize, &signatureLength, &status); 1637 if(bomEncoding!=NULL ){ 1638 fileBufC += signatureLength; 1639 fileSize -= signatureLength; 1640 encoding = bomEncoding; 1641 } 1642 1643 // 1644 // Open a converter to take the rule file to UTF-16 1645 // 1646 conv = ucnv_open(encoding, &status); 1647 if (U_FAILURE(status)) { 1648 goto cleanUpAndReturn; 1649 } 1650 1651 // 1652 // Convert the rules to UChar. 1653 // Preflight first to determine required buffer size. 1654 // 1655 ulen = ucnv_toUChars(conv, 1656 NULL, // dest, 1657 0, // destCapacity, 1658 fileBufC, 1659 fileSize, 1660 &status); 1661 if (status == U_BUFFER_OVERFLOW_ERROR) { 1662 // Buffer Overflow is expected from the preflight operation. 1663 status = U_ZERO_ERROR; 1664 1665 retPtr = new UChar[ulen+1]; 1666 ucnv_toUChars(conv, 1667 retPtr, // dest, 1668 ulen+1, 1669 fileBufC, 1670 fileSize, 1671 &status); 1672 } 1673 1674 cleanUpAndReturn: 1675 fclose(f); 1676 delete []fileBuf; 1677 ucnv_close(conv); 1678 if (U_FAILURE(status)) { 1679 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1680 delete []retPtr; 1681 retPtr = 0; 1682 ulen = 0; 1683 }; 1684 return retPtr; 1685 } 1686 1687 1688 1689 //-------------------------------------------------------------------------------------------- 1690 // 1691 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1692 // 1693 //------------------------------------------------------------------------------------------- 1694 void RBBITest::TestUnicodeFiles() { 1695 RuleBasedBreakIterator *bi; 1696 UErrorCode status = U_ZERO_ERROR; 1697 1698 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1699 TEST_ASSERT_SUCCESS(status); 1700 if (U_SUCCESS(status)) { 1701 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1702 } 1703 delete bi; 1704 1705 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1706 TEST_ASSERT_SUCCESS(status); 1707 if (U_SUCCESS(status)) { 1708 runUnicodeTestData("WordBreakTest.txt", bi); 1709 } 1710 delete bi; 1711 1712 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1713 TEST_ASSERT_SUCCESS(status); 1714 if (U_SUCCESS(status)) { 1715 runUnicodeTestData("SentenceBreakTest.txt", bi); 1716 } 1717 delete bi; 1718 1719 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1720 TEST_ASSERT_SUCCESS(status); 1721 if (U_SUCCESS(status)) { 1722 runUnicodeTestData("LineBreakTest.txt", bi); 1723 } 1724 delete bi; 1725 } 1726 1727 1728 // Check for test cases from the Unicode test data files that are known to fail 1729 // and should be skipped because ICU is not yet able to fully implement the spec. 1730 // See ticket #7270. 1731 1732 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 1733 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file. 1734 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198 1735 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202 1736 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214 1737 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246 1738 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298 1739 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302 1740 }; 1741 if (strcmp(fileName, "LineBreakTest.txt") != 0) { 1742 return FALSE; 1743 } 1744 1745 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) { 1746 if (testCase == UnicodeString(badTestCases[i])) { 1747 return logKnownIssue("7270"); 1748 } 1749 } 1750 return FALSE; 1751 } 1752 1753 1754 //-------------------------------------------------------------------------------------------- 1755 // 1756 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1757 // 1758 //------------------------------------------------------------------------------------------- 1759 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1761 UErrorCode status = U_ZERO_ERROR; 1762 1763 // 1764 // Open and read the test data file, put it into a UnicodeString. 1765 // 1766 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1767 char testFileName[1000]; 1768 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1769 dataerrln("Can't open test data. Path too long."); 1770 return; 1771 } 1772 strcpy(testFileName, testDataDirectory); 1773 strcat(testFileName, fileName); 1774 1775 logln("Opening data file %s\n", fileName); 1776 1777 int len; 1778 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1779 if (status != U_FILE_ACCESS_ERROR) { 1780 TEST_ASSERT_SUCCESS(status); 1781 TEST_ASSERT(testFile != NULL); 1782 } 1783 if (U_FAILURE(status) || testFile == NULL) { 1784 return; /* something went wrong, error already output */ 1785 } 1786 UnicodeString testFileAsString(TRUE, testFile, len); 1787 1788 // 1789 // Parse the test data file using a regular expression. 1790 // Each kind of token is recognized in its own capture group; what type of item was scanned 1791 // is identified by which group had a match. 1792 // 1793 // Caputure Group # 1 2 3 4 5 1794 // Parses this item: divide x hex digits comment \n unrecognized \n 1795 // 1796 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1797 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1798 UnicodeString testString; 1799 UVector32 breakPositions(status); 1800 int lineNumber = 1; 1801 TEST_ASSERT_SUCCESS(status); 1802 if (U_FAILURE(status)) { 1803 return; 1804 } 1805 1806 // 1807 // Scan through each test case, building up the string to be broken in testString, 1808 // and the positions that should be boundaries in the breakPositions vector. 1809 // 1810 int spin = 0; 1811 while (tokenMatcher.find()) { 1812 if(tokenMatcher.hitEnd()) { 1813 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1814 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1815 and caused an infinite loop here on EBCDIC systems! 1816 */ 1817 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1818 // return; 1819 } 1820 if (tokenMatcher.start(1, status) >= 0) { 1821 // Scanned a divide sign, indicating a break position in the test data. 1822 if (testString.length()>0) { 1823 breakPositions.addElement(testString.length(), status); 1824 } 1825 } 1826 else if (tokenMatcher.start(2, status) >= 0) { 1827 // Scanned an 'x', meaning no break at this position in the test data 1828 // Nothing to be done here. 1829 } 1830 else if (tokenMatcher.start(3, status) >= 0) { 1831 // Scanned Hex digits. Convert them to binary, append to the character data string. 1832 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1833 int length = hexNumber.length(); 1834 if (length<=8) { 1835 char buf[10]; 1836 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1837 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1838 if (c<=0x10ffff) { 1839 testString.append(c); 1840 } else { 1841 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1842 fileName, lineNumber); 1843 } 1844 } else { 1845 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1846 fileName, lineNumber); 1847 } 1848 } 1849 else if (tokenMatcher.start(4, status) >= 0) { 1850 // Scanned to end of a line, possibly skipping over a comment in the process. 1851 // If the line from the file contained test data, run the test now. 1852 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1853 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1854 } 1855 1856 // Clear out this test case. 1857 // The string and breakPositions vector will be refilled as the next 1858 // test case is parsed. 1859 testString.remove(); 1860 breakPositions.removeAllElements(); 1861 lineNumber++; 1862 } else { 1863 // Scanner catchall. Something unrecognized appeared on the line. 1864 char token[16]; 1865 UnicodeString uToken = tokenMatcher.group(0, status); 1866 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1867 token[sizeof(token)-1] = 0; 1868 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1869 1870 // Clean up, in preparation for continuing with the next line. 1871 testString.remove(); 1872 breakPositions.removeAllElements(); 1873 lineNumber++; 1874 } 1875 TEST_ASSERT_SUCCESS(status); 1876 if (U_FAILURE(status)) { 1877 break; 1878 } 1879 } 1880 1881 delete [] testFile; 1882 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1883 } 1884 1885 //-------------------------------------------------------------------------------------------- 1886 // 1887 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1888 // test data files. Do only a simple, forward-only check - 1889 // this test is mostly to check that ICU and the Unicode 1890 // data agree with each other. 1891 // 1892 //-------------------------------------------------------------------------------------------- 1893 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1894 const UnicodeString &testString, // Text data to be broken 1895 UVector32 *breakPositions, // Positions where breaks should be found. 1896 RuleBasedBreakIterator *bi) { 1897 int32_t pos; // Break Position in the test string 1898 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 1899 int32_t expectedPos; // Expected break position (index into test string) 1900 1901 bi->setText(testString); 1902 pos = bi->first(); 1903 pos = bi->next(); 1904 1905 while (pos != BreakIterator::DONE) { 1906 if (expectedI >= breakPositions->size()) { 1907 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1908 testFileName, lineNumber, pos); 1909 break; 1910 } 1911 expectedPos = breakPositions->elementAti(expectedI); 1912 if (pos < expectedPos) { 1913 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1914 testFileName, lineNumber, pos); 1915 break; 1916 } 1917 if (pos > expectedPos) { 1918 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1919 testFileName, lineNumber, expectedPos); 1920 break; 1921 } 1922 pos = bi->next(); 1923 expectedI++; 1924 } 1925 1926 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1927 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1928 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1929 } 1930 } 1931 1932 1933 1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1935 //--------------------------------------------------------------------------------------- 1936 // 1937 // classs RBBIMonkeyKind 1938 // 1939 // Monkey Test for Break Iteration 1940 // Abstract interface class. Concrete derived classes independently 1941 // implement the break rules for different iterator types. 1942 // 1943 // The Monkey Test itself uses doesn't know which type of break iterator it is 1944 // testing, but works purely in terms of the interface defined here. 1945 // 1946 //--------------------------------------------------------------------------------------- 1947 class RBBIMonkeyKind { 1948 public: 1949 // Return a UVector of UnicodeSets, representing the character classes used 1950 // for this type of iterator. 1951 virtual UVector *charClasses() = 0; 1952 1953 // Set the test text on which subsequent calls to next() will operate 1954 virtual void setText(const UnicodeString &s) = 0; 1955 1956 // Find the next break postion, starting from the prev break position, or from zero. 1957 // Return -1 after reaching end of string. 1958 virtual int32_t next(int32_t i) = 0; 1959 1960 virtual ~RBBIMonkeyKind(); 1961 UErrorCode deferredStatus; 1962 1963 1964 protected: 1965 RBBIMonkeyKind(); 1966 1967 private: 1968 }; 1969 1970 RBBIMonkeyKind::RBBIMonkeyKind() { 1971 deferredStatus = U_ZERO_ERROR; 1972 } 1973 1974 RBBIMonkeyKind::~RBBIMonkeyKind() { 1975 } 1976 1977 1978 //---------------------------------------------------------------------------------------- 1979 // 1980 // Random Numbers. Similar to standard lib rand() and srand() 1981 // Not using library to 1982 // 1. Get same results on all platforms. 1983 // 2. Get access to current seed, to more easily reproduce failures. 1984 // 1985 //--------------------------------------------------------------------------------------- 1986 static uint32_t m_seed = 1; 1987 1988 static uint32_t m_rand() 1989 { 1990 m_seed = m_seed * 1103515245 + 12345; 1991 return (uint32_t)(m_seed/65536) % 32768; 1992 } 1993 1994 1995 //------------------------------------------------------------------------------------------ 1996 // 1997 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1998 // of RBBIMonkeyKind. 1999 // 2000 //------------------------------------------------------------------------------------------ 2001 class RBBICharMonkey: public RBBIMonkeyKind { 2002 public: 2003 RBBICharMonkey(); 2004 virtual ~RBBICharMonkey(); 2005 virtual UVector *charClasses(); 2006 virtual void setText(const UnicodeString &s); 2007 virtual int32_t next(int32_t i); 2008 private: 2009 UVector *fSets; 2010 2011 UnicodeSet *fCRLFSet; 2012 UnicodeSet *fControlSet; 2013 UnicodeSet *fExtendSet; 2014 UnicodeSet *fRegionalIndicatorSet; 2015 UnicodeSet *fPrependSet; 2016 UnicodeSet *fSpacingSet; 2017 UnicodeSet *fLSet; 2018 UnicodeSet *fVSet; 2019 UnicodeSet *fTSet; 2020 UnicodeSet *fLVSet; 2021 UnicodeSet *fLVTSet; 2022 UnicodeSet *fHangulSet; 2023 UnicodeSet *fAnySet; 2024 2025 const UnicodeString *fText; 2026 }; 2027 2028 2029 RBBICharMonkey::RBBICharMonkey() { 2030 UErrorCode status = U_ZERO_ERROR; 2031 2032 fText = NULL; 2033 2034 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2035 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2036 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2037 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 2038 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2039 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2040 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2041 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2042 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2043 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2044 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2045 fHangulSet = new UnicodeSet(); 2046 fHangulSet->addAll(*fLSet); 2047 fHangulSet->addAll(*fVSet); 2048 fHangulSet->addAll(*fTSet); 2049 fHangulSet->addAll(*fLVSet); 2050 fHangulSet->addAll(*fLVTSet); 2051 fAnySet = new UnicodeSet(0, 0x10ffff); 2052 2053 fSets = new UVector(status); 2054 fSets->addElement(fCRLFSet, status); 2055 fSets->addElement(fControlSet, status); 2056 fSets->addElement(fExtendSet, status); 2057 fSets->addElement(fRegionalIndicatorSet, status); 2058 if (!fPrependSet->isEmpty()) { 2059 fSets->addElement(fPrependSet, status); 2060 } 2061 fSets->addElement(fSpacingSet, status); 2062 fSets->addElement(fHangulSet, status); 2063 fSets->addElement(fAnySet, status); 2064 if (U_FAILURE(status)) { 2065 deferredStatus = status; 2066 } 2067 } 2068 2069 2070 void RBBICharMonkey::setText(const UnicodeString &s) { 2071 fText = &s; 2072 } 2073 2074 2075 2076 int32_t RBBICharMonkey::next(int32_t prevPos) { 2077 int p0, p1, p2, p3; // Indices of the significant code points around the 2078 // break position being tested. The candidate break 2079 // location is before p2. 2080 2081 int breakPos = -1; 2082 2083 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2084 2085 if (U_FAILURE(deferredStatus)) { 2086 return -1; 2087 } 2088 2089 // Previous break at end of string. return DONE. 2090 if (prevPos >= fText->length()) { 2091 return -1; 2092 } 2093 p0 = p1 = p2 = p3 = prevPos; 2094 c3 = fText->char32At(prevPos); 2095 c0 = c1 = c2 = 0; 2096 (void)p0; // suppress set but not used warning. 2097 (void)c0; 2098 2099 // Loop runs once per "significant" character position in the input text. 2100 for (;;) { 2101 // Move all of the positions forward in the input string. 2102 p0 = p1; c0 = c1; 2103 p1 = p2; c1 = c2; 2104 p2 = p3; c2 = c3; 2105 2106 // Advancd p3 by one codepoint 2107 p3 = fText->moveIndex32(p3, 1); 2108 c3 = fText->char32At(p3); 2109 2110 if (p1 == p2) { 2111 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2112 continue; 2113 } 2114 if (p2 == fText->length()) { 2115 // Reached end of string. Always a break position. 2116 break; 2117 } 2118 2119 // Rule GB3 CR x LF 2120 // No Extend or Format characters may appear between the CR and LF, 2121 // which requires the additional check for p2 immediately following p1. 2122 // 2123 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2124 continue; 2125 } 2126 2127 // Rule (GB4). ( Control | CR | LF ) <break> 2128 if (fControlSet->contains(c1) || 2129 c1 == 0x0D || 2130 c1 == 0x0A) { 2131 break; 2132 } 2133 2134 // Rule (GB5) <break> ( Control | CR | LF ) 2135 // 2136 if (fControlSet->contains(c2) || 2137 c2 == 0x0D || 2138 c2 == 0x0A) { 2139 break; 2140 } 2141 2142 2143 // Rule (GB6) L x ( L | V | LV | LVT ) 2144 if (fLSet->contains(c1) && 2145 (fLSet->contains(c2) || 2146 fVSet->contains(c2) || 2147 fLVSet->contains(c2) || 2148 fLVTSet->contains(c2))) { 2149 continue; 2150 } 2151 2152 // Rule (GB7) ( LV | V ) x ( V | T ) 2153 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2154 (fVSet->contains(c2) || fTSet->contains(c2))) { 2155 continue; 2156 } 2157 2158 // Rule (GB8) ( LVT | T) x T 2159 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2160 fTSet->contains(c2)) { 2161 continue; 2162 } 2163 2164 // Rule (GB8a) Regional_Indicator x Regional_Indicator 2165 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2166 continue; 2167 } 2168 2169 // Rule (GB9) Numeric x ALetter 2170 if (fExtendSet->contains(c2)) { 2171 continue; 2172 } 2173 2174 // Rule (GB9a) x SpacingMark 2175 if (fSpacingSet->contains(c2)) { 2176 continue; 2177 } 2178 2179 // Rule (GB9b) Prepend x 2180 if (fPrependSet->contains(c1)) { 2181 continue; 2182 } 2183 2184 // Rule (GB10) Any <break> Any 2185 break; 2186 } 2187 2188 breakPos = p2; 2189 return breakPos; 2190 } 2191 2192 2193 2194 UVector *RBBICharMonkey::charClasses() { 2195 return fSets; 2196 } 2197 2198 2199 RBBICharMonkey::~RBBICharMonkey() { 2200 delete fSets; 2201 delete fCRLFSet; 2202 delete fControlSet; 2203 delete fExtendSet; 2204 delete fRegionalIndicatorSet; 2205 delete fPrependSet; 2206 delete fSpacingSet; 2207 delete fLSet; 2208 delete fVSet; 2209 delete fTSet; 2210 delete fLVSet; 2211 delete fLVTSet; 2212 delete fHangulSet; 2213 delete fAnySet; 2214 } 2215 2216 //------------------------------------------------------------------------------------------ 2217 // 2218 // class RBBIWordMonkey Word Break specific implementation 2219 // of RBBIMonkeyKind. 2220 // 2221 //------------------------------------------------------------------------------------------ 2222 class RBBIWordMonkey: public RBBIMonkeyKind { 2223 public: 2224 RBBIWordMonkey(); 2225 virtual ~RBBIWordMonkey(); 2226 virtual UVector *charClasses(); 2227 virtual void setText(const UnicodeString &s); 2228 virtual int32_t next(int32_t i); 2229 private: 2230 UVector *fSets; 2231 2232 UnicodeSet *fCRSet; 2233 UnicodeSet *fLFSet; 2234 UnicodeSet *fNewlineSet; 2235 UnicodeSet *fRegionalIndicatorSet; 2236 UnicodeSet *fKatakanaSet; 2237 UnicodeSet *fHebrew_LetterSet; 2238 UnicodeSet *fALetterSet; 2239 // TODO(jungshik): Do we still need this change? 2240 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 2241 UnicodeSet *fSingle_QuoteSet; 2242 UnicodeSet *fDouble_QuoteSet; 2243 UnicodeSet *fMidNumLetSet; 2244 UnicodeSet *fMidLetterSet; 2245 UnicodeSet *fMidNumSet; 2246 UnicodeSet *fNumericSet; 2247 UnicodeSet *fFormatSet; 2248 UnicodeSet *fOtherSet; 2249 UnicodeSet *fExtendSet; 2250 UnicodeSet *fExtendNumLetSet; 2251 UnicodeSet *fDictionaryCjkSet; 2252 2253 const UnicodeString *fText; 2254 }; 2255 2256 2257 RBBIWordMonkey::RBBIWordMonkey() 2258 { 2259 UErrorCode status = U_ZERO_ERROR; 2260 2261 fSets = new UVector(status); 2262 2263 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2264 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2265 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2266 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); 2267 // Exclude Hangul syllables from ALetterSet during testing. 2268 // Leave CJK dictionary characters out from the monkey tests! 2269 #if 0 2270 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2271 "[\\p{Line_Break = Complex_Context}" 2272 "-\\p{Grapheme_Cluster_Break = Extend}" 2273 "-\\p{Grapheme_Cluster_Break = Control}" 2274 "]]", 2275 status); 2276 #endif 2277 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2278 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2279 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 2280 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2281 fALetterSet->removeAll(*fDictionaryCjkSet); 2282 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 2283 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 2284 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2285 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2286 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2287 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 2288 // we should figure out why 2289 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2290 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2291 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2292 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2293 2294 fOtherSet = new UnicodeSet(); 2295 if(U_FAILURE(status)) { 2296 deferredStatus = status; 2297 return; 2298 } 2299 2300 fOtherSet->complement(); 2301 fOtherSet->removeAll(*fCRSet); 2302 fOtherSet->removeAll(*fLFSet); 2303 fOtherSet->removeAll(*fNewlineSet); 2304 fOtherSet->removeAll(*fKatakanaSet); 2305 fOtherSet->removeAll(*fHebrew_LetterSet); 2306 fOtherSet->removeAll(*fALetterSet); 2307 fOtherSet->removeAll(*fSingle_QuoteSet); 2308 fOtherSet->removeAll(*fDouble_QuoteSet); 2309 fOtherSet->removeAll(*fMidLetterSet); 2310 fOtherSet->removeAll(*fMidNumSet); 2311 fOtherSet->removeAll(*fNumericSet); 2312 fOtherSet->removeAll(*fExtendNumLetSet); 2313 fOtherSet->removeAll(*fFormatSet); 2314 fOtherSet->removeAll(*fExtendSet); 2315 fOtherSet->removeAll(*fRegionalIndicatorSet); 2316 // Inhibit dictionary characters from being tested at all. 2317 fOtherSet->removeAll(*fDictionaryCjkSet); 2318 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2319 2320 fSets->addElement(fCRSet, status); 2321 fSets->addElement(fLFSet, status); 2322 fSets->addElement(fNewlineSet, status); 2323 fSets->addElement(fRegionalIndicatorSet, status); 2324 fSets->addElement(fHebrew_LetterSet, status); 2325 fSets->addElement(fALetterSet, status); 2326 fSets->addElement(fSingle_QuoteSet, status); 2327 fSets->addElement(fDouble_QuoteSet, status); 2328 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 2329 fSets->addElement(fMidLetterSet, status); 2330 fSets->addElement(fMidNumLetSet, status); 2331 fSets->addElement(fMidNumSet, status); 2332 fSets->addElement(fNumericSet, status); 2333 fSets->addElement(fFormatSet, status); 2334 fSets->addElement(fExtendSet, status); 2335 fSets->addElement(fOtherSet, status); 2336 fSets->addElement(fExtendNumLetSet, status); 2337 2338 if (U_FAILURE(status)) { 2339 deferredStatus = status; 2340 } 2341 } 2342 2343 void RBBIWordMonkey::setText(const UnicodeString &s) { 2344 fText = &s; 2345 } 2346 2347 2348 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2349 int p0, p1, p2, p3; // Indices of the significant code points around the 2350 // break position being tested. The candidate break 2351 // location is before p2. 2352 2353 int breakPos = -1; 2354 2355 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2356 2357 if (U_FAILURE(deferredStatus)) { 2358 return -1; 2359 } 2360 2361 // Prev break at end of string. return DONE. 2362 if (prevPos >= fText->length()) { 2363 return -1; 2364 } 2365 p0 = p1 = p2 = p3 = prevPos; 2366 c3 = fText->char32At(prevPos); 2367 c0 = c1 = c2 = 0; 2368 (void)p0; // Suppress set but not used warning. 2369 2370 // Loop runs once per "significant" character position in the input text. 2371 for (;;) { 2372 // Move all of the positions forward in the input string. 2373 p0 = p1; c0 = c1; 2374 p1 = p2; c1 = c2; 2375 p2 = p3; c2 = c3; 2376 2377 // Advancd p3 by X(Extend | Format)* Rule 4 2378 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2379 do { 2380 p3 = fText->moveIndex32(p3, 1); 2381 c3 = fText->char32At(p3); 2382 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2383 break; 2384 }; 2385 } 2386 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2387 2388 2389 if (p1 == p2) { 2390 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2391 continue; 2392 } 2393 if (p2 == fText->length()) { 2394 // Reached end of string. Always a break position. 2395 break; 2396 } 2397 2398 // Rule (3) CR x LF 2399 // No Extend or Format characters may appear between the CR and LF, 2400 // which requires the additional check for p2 immediately following p1. 2401 // 2402 if (c1==0x0D && c2==0x0A) { 2403 continue; 2404 } 2405 2406 // Rule (3a) Break before and after newlines (including CR and LF) 2407 // 2408 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2409 break; 2410 }; 2411 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2412 break; 2413 }; 2414 2415 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2416 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2417 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2418 continue; 2419 } 2420 2421 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2422 // 2423 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2424 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2425 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2426 continue; 2427 } 2428 2429 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2430 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2431 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2432 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2433 continue; 2434 } 2435 2436 // Rule (7a) Hebrew_Letter x Single_Quote 2437 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2438 continue; 2439 } 2440 2441 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2442 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2443 continue; 2444 } 2445 2446 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2447 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2448 continue; 2449 } 2450 2451 // Rule (8) Numeric x Numeric 2452 if (fNumericSet->contains(c1) && 2453 fNumericSet->contains(c2)) { 2454 continue; 2455 } 2456 2457 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2458 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2459 fNumericSet->contains(c2)) { 2460 continue; 2461 } 2462 2463 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2464 if (fNumericSet->contains(c1) && 2465 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2466 continue; 2467 } 2468 2469 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2470 if (fNumericSet->contains(c0) && 2471 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2472 fNumericSet->contains(c2)) { 2473 continue; 2474 } 2475 2476 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2477 if (fNumericSet->contains(c1) && 2478 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2479 fNumericSet->contains(c3)) { 2480 continue; 2481 } 2482 2483 // Rule (13) Katakana x Katakana 2484 if (fKatakanaSet->contains(c1) && 2485 fKatakanaSet->contains(c2)) { 2486 continue; 2487 } 2488 2489 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2490 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2491 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2492 fExtendNumLetSet->contains(c2)) { 2493 continue; 2494 } 2495 2496 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2497 if (fExtendNumLetSet->contains(c1) && 2498 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2499 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2500 continue; 2501 } 2502 2503 // Rule 13c 2504 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2505 continue; 2506 } 2507 2508 // Rule 14. Break found here. 2509 break; 2510 } 2511 2512 breakPos = p2; 2513 return breakPos; 2514 } 2515 2516 2517 UVector *RBBIWordMonkey::charClasses() { 2518 return fSets; 2519 } 2520 2521 2522 RBBIWordMonkey::~RBBIWordMonkey() { 2523 delete fSets; 2524 delete fCRSet; 2525 delete fLFSet; 2526 delete fNewlineSet; 2527 delete fKatakanaSet; 2528 delete fHebrew_LetterSet; 2529 delete fALetterSet; 2530 delete fSingle_QuoteSet; 2531 delete fDouble_QuoteSet; 2532 delete fMidNumLetSet; 2533 delete fMidLetterSet; 2534 delete fMidNumSet; 2535 delete fNumericSet; 2536 delete fFormatSet; 2537 delete fExtendSet; 2538 delete fExtendNumLetSet; 2539 delete fRegionalIndicatorSet; 2540 delete fDictionaryCjkSet; 2541 delete fOtherSet; 2542 } 2543 2544 2545 2546 2547 //------------------------------------------------------------------------------------------ 2548 // 2549 // class RBBISentMonkey Sentence Break specific implementation 2550 // of RBBIMonkeyKind. 2551 // 2552 //------------------------------------------------------------------------------------------ 2553 class RBBISentMonkey: public RBBIMonkeyKind { 2554 public: 2555 RBBISentMonkey(); 2556 virtual ~RBBISentMonkey(); 2557 virtual UVector *charClasses(); 2558 virtual void setText(const UnicodeString &s); 2559 virtual int32_t next(int32_t i); 2560 private: 2561 int moveBack(int posFrom); 2562 int moveForward(int posFrom); 2563 UChar32 cAt(int pos); 2564 2565 UVector *fSets; 2566 2567 UnicodeSet *fSepSet; 2568 UnicodeSet *fFormatSet; 2569 UnicodeSet *fSpSet; 2570 UnicodeSet *fLowerSet; 2571 UnicodeSet *fUpperSet; 2572 UnicodeSet *fOLetterSet; 2573 UnicodeSet *fNumericSet; 2574 UnicodeSet *fATermSet; 2575 UnicodeSet *fSContinueSet; 2576 UnicodeSet *fSTermSet; 2577 UnicodeSet *fCloseSet; 2578 UnicodeSet *fOtherSet; 2579 UnicodeSet *fExtendSet; 2580 2581 const UnicodeString *fText; 2582 2583 }; 2584 2585 RBBISentMonkey::RBBISentMonkey() 2586 { 2587 UErrorCode status = U_ZERO_ERROR; 2588 2589 fSets = new UVector(status); 2590 2591 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2592 // set and made into character classes of their own. For the monkey impl, 2593 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2594 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2595 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2596 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2597 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2598 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2599 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2600 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2601 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2602 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2603 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2604 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2605 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2606 fOtherSet = new UnicodeSet(); 2607 2608 if(U_FAILURE(status)) { 2609 deferredStatus = status; 2610 return; 2611 } 2612 2613 fOtherSet->complement(); 2614 fOtherSet->removeAll(*fSepSet); 2615 fOtherSet->removeAll(*fFormatSet); 2616 fOtherSet->removeAll(*fSpSet); 2617 fOtherSet->removeAll(*fLowerSet); 2618 fOtherSet->removeAll(*fUpperSet); 2619 fOtherSet->removeAll(*fOLetterSet); 2620 fOtherSet->removeAll(*fNumericSet); 2621 fOtherSet->removeAll(*fATermSet); 2622 fOtherSet->removeAll(*fSContinueSet); 2623 fOtherSet->removeAll(*fSTermSet); 2624 fOtherSet->removeAll(*fCloseSet); 2625 fOtherSet->removeAll(*fExtendSet); 2626 2627 fSets->addElement(fSepSet, status); 2628 fSets->addElement(fFormatSet, status); 2629 fSets->addElement(fSpSet, status); 2630 fSets->addElement(fLowerSet, status); 2631 fSets->addElement(fUpperSet, status); 2632 fSets->addElement(fOLetterSet, status); 2633 fSets->addElement(fNumericSet, status); 2634 fSets->addElement(fATermSet, status); 2635 fSets->addElement(fSContinueSet, status); 2636 fSets->addElement(fSTermSet, status); 2637 fSets->addElement(fCloseSet, status); 2638 fSets->addElement(fOtherSet, status); 2639 fSets->addElement(fExtendSet, status); 2640 2641 if (U_FAILURE(status)) { 2642 deferredStatus = status; 2643 } 2644 } 2645 2646 2647 2648 void RBBISentMonkey::setText(const UnicodeString &s) { 2649 fText = &s; 2650 } 2651 2652 UVector *RBBISentMonkey::charClasses() { 2653 return fSets; 2654 } 2655 2656 2657 // moveBack() Find the "significant" code point preceding the index i. 2658 // Skips over ($Extend | $Format)* . 2659 // 2660 int RBBISentMonkey::moveBack(int i) { 2661 if (i <= 0) { 2662 return -1; 2663 } 2664 UChar32 c; 2665 int32_t j = i; 2666 do { 2667 j = fText->moveIndex32(j, -1); 2668 c = fText->char32At(j); 2669 } 2670 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2671 return j; 2672 2673 } 2674 2675 2676 int RBBISentMonkey::moveForward(int i) { 2677 if (i>=fText->length()) { 2678 return fText->length(); 2679 } 2680 UChar32 c; 2681 int32_t j = i; 2682 do { 2683 j = fText->moveIndex32(j, 1); 2684 c = cAt(j); 2685 } 2686 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2687 return j; 2688 } 2689 2690 UChar32 RBBISentMonkey::cAt(int pos) { 2691 if (pos<0 || pos>=fText->length()) { 2692 return -1; 2693 } else { 2694 return fText->char32At(pos); 2695 } 2696 } 2697 2698 int32_t RBBISentMonkey::next(int32_t prevPos) { 2699 int p0, p1, p2, p3; // Indices of the significant code points around the 2700 // break position being tested. The candidate break 2701 // location is before p2. 2702 2703 int breakPos = -1; 2704 2705 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2706 UChar32 c; 2707 2708 if (U_FAILURE(deferredStatus)) { 2709 return -1; 2710 } 2711 2712 // Prev break at end of string. return DONE. 2713 if (prevPos >= fText->length()) { 2714 return -1; 2715 } 2716 p0 = p1 = p2 = p3 = prevPos; 2717 c3 = fText->char32At(prevPos); 2718 c0 = c1 = c2 = 0; 2719 (void)p0; // Suppress set but not used warning. 2720 2721 // Loop runs once per "significant" character position in the input text. 2722 for (;;) { 2723 // Move all of the positions forward in the input string. 2724 p0 = p1; c0 = c1; 2725 p1 = p2; c1 = c2; 2726 p2 = p3; c2 = c3; 2727 2728 // Advancd p3 by X(Extend | Format)* Rule 4 2729 p3 = moveForward(p3); 2730 c3 = cAt(p3); 2731 2732 // Rule (3) CR x LF 2733 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2734 continue; 2735 } 2736 2737 // Rule (4). Sep <break> 2738 if (fSepSet->contains(c1)) { 2739 p2 = p1+1; // Separators don't combine with Extend or Format. 2740 break; 2741 } 2742 2743 if (p2 >= fText->length()) { 2744 // Reached end of string. Always a break position. 2745 break; 2746 } 2747 2748 if (p2 == prevPos) { 2749 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2750 continue; 2751 } 2752 2753 // Rule (6). ATerm x Numeric 2754 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2755 continue; 2756 } 2757 2758 // Rule (7). Upper ATerm x Uppper 2759 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2760 continue; 2761 } 2762 2763 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2764 // Note: STerm | ATerm are added to the negated part of the expression by a 2765 // note to the Unicode 5.0 documents. 2766 int p8 = p1; 2767 while (fSpSet->contains(cAt(p8))) { 2768 p8 = moveBack(p8); 2769 } 2770 while (fCloseSet->contains(cAt(p8))) { 2771 p8 = moveBack(p8); 2772 } 2773 if (fATermSet->contains(cAt(p8))) { 2774 p8=p2; 2775 for (;;) { 2776 c = cAt(p8); 2777 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2778 fLowerSet->contains(c) || fSepSet->contains(c) || 2779 fATermSet->contains(c) || fSTermSet->contains(c)) { 2780 break; 2781 } 2782 p8 = moveForward(p8); 2783 } 2784 if (fLowerSet->contains(cAt(p8))) { 2785 continue; 2786 } 2787 } 2788 2789 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2790 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2791 p8 = p1; 2792 while (fSpSet->contains(cAt(p8))) { 2793 p8 = moveBack(p8); 2794 } 2795 while (fCloseSet->contains(cAt(p8))) { 2796 p8 = moveBack(p8); 2797 } 2798 c = cAt(p8); 2799 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2800 continue; 2801 } 2802 } 2803 2804 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2805 int p9 = p1; 2806 while (fCloseSet->contains(cAt(p9))) { 2807 p9 = moveBack(p9); 2808 } 2809 c = cAt(p9); 2810 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2811 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2812 continue; 2813 } 2814 } 2815 2816 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2817 int p10 = p1; 2818 while (fSpSet->contains(cAt(p10))) { 2819 p10 = moveBack(p10); 2820 } 2821 while (fCloseSet->contains(cAt(p10))) { 2822 p10 = moveBack(p10); 2823 } 2824 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2825 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2826 continue; 2827 } 2828 } 2829 2830 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break> 2831 int p11 = p1; 2832 if (fSepSet->contains(cAt(p11))) { 2833 p11 = moveBack(p11); 2834 } 2835 while (fSpSet->contains(cAt(p11))) { 2836 p11 = moveBack(p11); 2837 } 2838 while (fCloseSet->contains(cAt(p11))) { 2839 p11 = moveBack(p11); 2840 } 2841 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2842 break; 2843 } 2844 2845 // Rule (12) Any x Any 2846 continue; 2847 } 2848 breakPos = p2; 2849 return breakPos; 2850 } 2851 2852 RBBISentMonkey::~RBBISentMonkey() { 2853 delete fSets; 2854 delete fSepSet; 2855 delete fFormatSet; 2856 delete fSpSet; 2857 delete fLowerSet; 2858 delete fUpperSet; 2859 delete fOLetterSet; 2860 delete fNumericSet; 2861 delete fATermSet; 2862 delete fSContinueSet; 2863 delete fSTermSet; 2864 delete fCloseSet; 2865 delete fOtherSet; 2866 delete fExtendSet; 2867 } 2868 2869 2870 2871 //------------------------------------------------------------------------------------------- 2872 // 2873 // RBBILineMonkey 2874 // 2875 //------------------------------------------------------------------------------------------- 2876 2877 class RBBILineMonkey: public RBBIMonkeyKind { 2878 public: 2879 RBBILineMonkey(); 2880 virtual ~RBBILineMonkey(); 2881 virtual UVector *charClasses(); 2882 virtual void setText(const UnicodeString &s); 2883 virtual int32_t next(int32_t i); 2884 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2885 private: 2886 UVector *fSets; 2887 2888 UnicodeSet *fBK; 2889 UnicodeSet *fCR; 2890 UnicodeSet *fLF; 2891 UnicodeSet *fCM; 2892 UnicodeSet *fNL; 2893 UnicodeSet *fSG; 2894 UnicodeSet *fWJ; 2895 UnicodeSet *fZW; 2896 UnicodeSet *fGL; 2897 UnicodeSet *fCB; 2898 UnicodeSet *fSP; 2899 UnicodeSet *fB2; 2900 UnicodeSet *fBA; 2901 UnicodeSet *fBB; 2902 UnicodeSet *fHY; 2903 UnicodeSet *fH2; 2904 UnicodeSet *fH3; 2905 UnicodeSet *fCL; 2906 UnicodeSet *fCP; 2907 UnicodeSet *fEX; 2908 UnicodeSet *fIN; 2909 UnicodeSet *fJL; 2910 UnicodeSet *fJV; 2911 UnicodeSet *fJT; 2912 UnicodeSet *fNS; 2913 UnicodeSet *fOP; 2914 UnicodeSet *fQU; 2915 UnicodeSet *fIS; 2916 UnicodeSet *fNU; 2917 UnicodeSet *fPO; 2918 UnicodeSet *fPR; 2919 UnicodeSet *fSY; 2920 UnicodeSet *fAI; 2921 UnicodeSet *fAL; 2922 UnicodeSet *fCJ; 2923 UnicodeSet *fHL; 2924 UnicodeSet *fID; 2925 UnicodeSet *fRI; 2926 UnicodeSet *fSA; 2927 UnicodeSet *fXX; 2928 2929 BreakIterator *fCharBI; 2930 const UnicodeString *fText; 2931 RegexMatcher *fNumberMatcher; 2932 }; 2933 2934 2935 RBBILineMonkey::RBBILineMonkey() 2936 { 2937 UErrorCode status = U_ZERO_ERROR; 2938 2939 fSets = new UVector(status); 2940 2941 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2942 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2943 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2944 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2945 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2946 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2947 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2948 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2949 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2950 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2951 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2952 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2953 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2954 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2955 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2956 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2957 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2958 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2959 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2960 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2961 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2962 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2963 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2964 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2965 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2966 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2967 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2968 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2969 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2970 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2971 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2972 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2973 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2974 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2975 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2976 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2977 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2978 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2979 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2980 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2981 2982 if (U_FAILURE(status)) { 2983 deferredStatus = status; 2984 fCharBI = NULL; 2985 fNumberMatcher = NULL; 2986 return; 2987 } 2988 2989 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2990 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2991 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2992 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2993 2994 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 2995 2996 fSets->addElement(fBK, status); 2997 fSets->addElement(fCR, status); 2998 fSets->addElement(fLF, status); 2999 fSets->addElement(fCM, status); 3000 fSets->addElement(fNL, status); 3001 fSets->addElement(fWJ, status); 3002 fSets->addElement(fZW, status); 3003 fSets->addElement(fGL, status); 3004 fSets->addElement(fCB, status); 3005 fSets->addElement(fSP, status); 3006 fSets->addElement(fB2, status); 3007 fSets->addElement(fBA, status); 3008 fSets->addElement(fBB, status); 3009 fSets->addElement(fHY, status); 3010 fSets->addElement(fH2, status); 3011 fSets->addElement(fH3, status); 3012 fSets->addElement(fCL, status); 3013 fSets->addElement(fCP, status); 3014 fSets->addElement(fEX, status); 3015 fSets->addElement(fIN, status); 3016 fSets->addElement(fJL, status); 3017 fSets->addElement(fJT, status); 3018 fSets->addElement(fJV, status); 3019 fSets->addElement(fNS, status); 3020 fSets->addElement(fOP, status); 3021 fSets->addElement(fQU, status); 3022 fSets->addElement(fIS, status); 3023 fSets->addElement(fNU, status); 3024 fSets->addElement(fPO, status); 3025 fSets->addElement(fPR, status); 3026 fSets->addElement(fSY, status); 3027 fSets->addElement(fAI, status); 3028 fSets->addElement(fAL, status); 3029 fSets->addElement(fHL, status); 3030 fSets->addElement(fID, status); 3031 fSets->addElement(fWJ, status); 3032 fSets->addElement(fRI, status); 3033 fSets->addElement(fSA, status); 3034 fSets->addElement(fSG, status); 3035 3036 const char *rules = 3037 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3038 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3039 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3040 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3041 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 3042 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3043 3044 fNumberMatcher = new RegexMatcher( 3045 UnicodeString(rules, -1, US_INV), 0, status); 3046 3047 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3048 3049 if (U_FAILURE(status)) { 3050 deferredStatus = status; 3051 } 3052 } 3053 3054 3055 void RBBILineMonkey::setText(const UnicodeString &s) { 3056 fText = &s; 3057 fCharBI->setText(s); 3058 fNumberMatcher->reset(s); 3059 } 3060 3061 // 3062 // rule9Adjust 3063 // Line Break TR rules 9 and 10 implementation. 3064 // This deals with combining marks and other sequences that 3065 // that must be treated as if they were something other than what they actually are. 3066 // 3067 // This is factored out into a separate function because it must be applied twice for 3068 // each potential break, once to the chars before the position being checked, then 3069 // again to the text following the possible break. 3070 // 3071 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3072 if (pos == -1) { 3073 // Invalid initial position. Happens during the warmup iteration of the 3074 // main loop in next(). 3075 return; 3076 } 3077 3078 int32_t nPos = *nextPos; 3079 3080 // LB 9 Keep combining sequences together. 3081 // advance over any CM class chars. Note that Line Break CM is different 3082 // from the normal Grapheme Extend property. 3083 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3084 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3085 for (;;) { 3086 *nextChar = fText->char32At(nPos); 3087 if (!fCM->contains(*nextChar)) { 3088 break; 3089 } 3090 nPos = fText->moveIndex32(nPos, 1); 3091 } 3092 } 3093 3094 3095 // LB 9 Treat X CM* as if it were x. 3096 // No explicit action required. 3097 3098 // LB 10 Treat any remaining combining mark as AL 3099 if (fCM->contains(*posChar)) { 3100 *posChar = 0x41; // thisChar = 'A'; 3101 } 3102 3103 // Push the updated nextPos and nextChar back to our caller. 3104 // This only makes a difference if posChar got bigger by consuming a 3105 // combining sequence. 3106 *nextPos = nPos; 3107 *nextChar = fText->char32At(nPos); 3108 } 3109 3110 3111 3112 int32_t RBBILineMonkey::next(int32_t startPos) { 3113 UErrorCode status = U_ZERO_ERROR; 3114 int32_t pos; // Index of the char following a potential break position 3115 UChar32 thisChar; // Character at above position "pos" 3116 3117 int32_t prevPos; // Index of the char preceding a potential break position 3118 UChar32 prevChar; // Character at above position. Note that prevChar 3119 // and thisChar may not be adjacent because combining 3120 // characters between them will be ignored. 3121 3122 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 3123 UChar32 prevCharX2; 3124 3125 int32_t nextPos; // Index of the next character following pos. 3126 // Usually skips over combining marks. 3127 int32_t nextCPPos; // Index of the code point following "pos." 3128 // May point to a combining mark. 3129 int32_t tPos; // temp value. 3130 UChar32 c; 3131 3132 if (U_FAILURE(deferredStatus)) { 3133 return -1; 3134 } 3135 3136 if (startPos >= fText->length()) { 3137 return -1; 3138 } 3139 3140 3141 // Initial values for loop. Loop will run the first time without finding breaks, 3142 // while the invalid values shift out and the "this" and 3143 // "prev" positions are filled in with good values. 3144 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 3145 thisChar = prevChar = prevCharX2 = 0; 3146 nextPos = nextCPPos = startPos; 3147 3148 3149 // Loop runs once per position in the test text, until a break position 3150 // is found. 3151 for (;;) { 3152 prevPosX2 = prevPos; 3153 prevCharX2 = prevChar; 3154 3155 prevPos = pos; 3156 prevChar = thisChar; 3157 3158 pos = nextPos; 3159 thisChar = fText->char32At(pos); 3160 3161 nextCPPos = fText->moveIndex32(pos, 1); 3162 nextPos = nextCPPos; 3163 3164 // Rule LB2 - Break at end of text. 3165 if (pos >= fText->length()) { 3166 break; 3167 } 3168 3169 // Rule LB 9 - adjust for combining sequences. 3170 // We do this one out-of-order because the adjustment does not change anything 3171 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3172 // be applied. 3173 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3174 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3175 c = fText->char32At(nextPos); 3176 rule9Adjust(pos, &thisChar, &nextPos, &c); 3177 3178 // If the loop is still warming up - if we haven't shifted the initial 3179 // -1 positions out of prevPos yet - loop back to advance the 3180 // position in the input without any further looking for breaks. 3181 if (prevPos == -1) { 3182 continue; 3183 } 3184 3185 // LB 4 Always break after hard line breaks, 3186 if (fBK->contains(prevChar)) { 3187 break; 3188 } 3189 3190 // LB 5 Break after CR, LF, NL, but not inside CR LF 3191 if (prevChar == 0x0d && thisChar == 0x0a) { 3192 continue; 3193 } 3194 if (prevChar == 0x0d || 3195 prevChar == 0x0a || 3196 prevChar == 0x85) { 3197 break; 3198 } 3199 3200 // LB 6 Don't break before hard line breaks 3201 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3202 fBK->contains(thisChar)) { 3203 continue; 3204 } 3205 3206 3207 // LB 7 Don't break before spaces or zero-width space. 3208 if (fSP->contains(thisChar)) { 3209 continue; 3210 } 3211 3212 if (fZW->contains(thisChar)) { 3213 continue; 3214 } 3215 3216 // LB 8 Break after zero width space 3217 if (fZW->contains(prevChar)) { 3218 break; 3219 } 3220 3221 // LB 9, 10 Already done, at top of loop. 3222 // 3223 3224 3225 // LB 11 Do not break before or after WORD JOINER and related characters. 3226 // x WJ 3227 // WJ x 3228 // 3229 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3230 continue; 3231 } 3232 3233 // LB 12 3234 // GL x 3235 if (fGL->contains(prevChar)) { 3236 continue; 3237 } 3238 3239 // LB 12a 3240 // [^SP BA HY] x GL 3241 if (!(fSP->contains(prevChar) || 3242 fBA->contains(prevChar) || 3243 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3244 continue; 3245 } 3246 3247 3248 3249 // LB 13 Don't break before closings. 3250 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3251 // fall into LB 17 and the more general number regular expression. 3252 // 3253 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3254 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3255 fEX->contains(thisChar) || 3256 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3257 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3258 continue; 3259 } 3260 3261 // LB 14 Don't break after OP SP* 3262 // Scan backwards, checking for this sequence. 3263 // The OP char could include combining marks, so we actually check for 3264 // OP CM* SP* 3265 // Another Twist: The Rule 67 fixes may have changed a SP CM 3266 // sequence into a ID char, so before scanning back through spaces, 3267 // verify that prevChar is indeed a space. The prevChar variable 3268 // may differ from fText[prevPos] 3269 tPos = prevPos; 3270 if (fSP->contains(prevChar)) { 3271 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3272 tPos=fText->moveIndex32(tPos, -1); 3273 } 3274 } 3275 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3276 tPos=fText->moveIndex32(tPos, -1); 3277 } 3278 if (fOP->contains(fText->char32At(tPos))) { 3279 continue; 3280 } 3281 3282 3283 // LB 15 QU SP* x OP 3284 if (fOP->contains(thisChar)) { 3285 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3286 int tPos = prevPos; 3287 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3288 tPos = fText->moveIndex32(tPos, -1); 3289 } 3290 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3291 tPos = fText->moveIndex32(tPos, -1); 3292 } 3293 if (fQU->contains(fText->char32At(tPos))) { 3294 continue; 3295 } 3296 } 3297 3298 3299 3300 // LB 16 (CL | CP) SP* x NS 3301 // Scan backwards for SP* CM* (CL | CP) 3302 if (fNS->contains(thisChar)) { 3303 int tPos = prevPos; 3304 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3305 tPos = fText->moveIndex32(tPos, -1); 3306 } 3307 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3308 tPos = fText->moveIndex32(tPos, -1); 3309 } 3310 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3311 continue; 3312 } 3313 } 3314 3315 3316 // LB 17 B2 SP* x B2 3317 if (fB2->contains(thisChar)) { 3318 // Scan backwards, checking for the B2 CM* SP* sequence. 3319 tPos = prevPos; 3320 if (fSP->contains(prevChar)) { 3321 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3322 tPos=fText->moveIndex32(tPos, -1); 3323 } 3324 } 3325 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3326 tPos=fText->moveIndex32(tPos, -1); 3327 } 3328 if (fB2->contains(fText->char32At(tPos))) { 3329 continue; 3330 } 3331 } 3332 3333 3334 // LB 18 break after space 3335 if (fSP->contains(prevChar)) { 3336 break; 3337 } 3338 3339 // LB 19 3340 // x QU 3341 // QU x 3342 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3343 continue; 3344 } 3345 3346 // LB 20 Break around a CB 3347 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3348 break; 3349 } 3350 3351 // LB 21 3352 if (fBA->contains(thisChar) || 3353 fHY->contains(thisChar) || 3354 fNS->contains(thisChar) || 3355 fBB->contains(prevChar) ) { 3356 continue; 3357 } 3358 3359 // LB 21a 3360 // HL (HY | BA) x 3361 if (fHL->contains(prevCharX2) && 3362 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3363 continue; 3364 } 3365 3366 // LB 21b 3367 // SY x HL 3368 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3369 continue; 3370 } 3371 3372 // LB 22 3373 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3374 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3375 (fID->contains(prevChar) && fIN->contains(thisChar)) || 3376 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3377 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3378 continue; 3379 } 3380 3381 3382 // LB 23 ID x PO 3383 // AL x NU 3384 // HL x NU 3385 // NU x AL 3386 if ((fID->contains(prevChar) && fPO->contains(thisChar)) || 3387 (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3388 (fHL->contains(prevChar) && fNU->contains(thisChar)) || 3389 (fNU->contains(prevChar) && fAL->contains(thisChar)) || 3390 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) { 3391 continue; 3392 } 3393 3394 // LB 24 Do not break between prefix and letters or ideographs. 3395 // PR x ID 3396 // PR x (AL | HL) 3397 // PO x (AL | HL) 3398 if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3399 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) || 3400 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) { 3401 continue; 3402 } 3403 3404 3405 3406 // LB 25 Numbers 3407 if (fNumberMatcher->lookingAt(prevPos, status)) { 3408 if (U_FAILURE(status)) { 3409 break; 3410 } 3411 // Matched a number. But could have been just a single digit, which would 3412 // not represent a "no break here" between prevChar and thisChar 3413 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3414 if (numEndIdx > pos) { 3415 // Number match includes at least our two chars being checked 3416 if (numEndIdx > nextPos) { 3417 // Number match includes additional chars. Update pos and nextPos 3418 // so that next loop iteration will continue at the end of the number, 3419 // checking for breaks between last char in number & whatever follows. 3420 pos = nextPos = numEndIdx; 3421 do { 3422 pos = fText->moveIndex32(pos, -1); 3423 thisChar = fText->char32At(pos); 3424 } while (fCM->contains(thisChar)); 3425 } 3426 continue; 3427 } 3428 } 3429 3430 3431 // LB 26 Do not break a Korean syllable. 3432 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3433 fJV->contains(thisChar) || 3434 fH2->contains(thisChar) || 3435 fH3->contains(thisChar))) { 3436 continue; 3437 } 3438 3439 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3440 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3441 continue; 3442 } 3443 3444 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3445 fJT->contains(thisChar)) { 3446 continue; 3447 } 3448 3449 // LB 27 Treat a Korean Syllable Block the same as ID. 3450 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3451 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3452 fIN->contains(thisChar)) { 3453 continue; 3454 } 3455 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3456 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3457 fPO->contains(thisChar)) { 3458 continue; 3459 } 3460 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3461 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3462 continue; 3463 } 3464 3465 3466 3467 // LB 28 Do not break between alphabetics ("at"). 3468 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3469 continue; 3470 } 3471 3472 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3473 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3474 continue; 3475 } 3476 3477 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 3478 // (AL | NU) x OP 3479 // CP x (AL | NU) 3480 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3481 continue; 3482 } 3483 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3484 continue; 3485 } 3486 3487 // LB30a Do not break between regional indicators. 3488 // RI x RI 3489 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3490 continue; 3491 } 3492 3493 // LB 31 Break everywhere else 3494 break; 3495 3496 } 3497 3498 return pos; 3499 } 3500 3501 3502 UVector *RBBILineMonkey::charClasses() { 3503 return fSets; 3504 } 3505 3506 3507 RBBILineMonkey::~RBBILineMonkey() { 3508 delete fSets; 3509 3510 delete fBK; 3511 delete fCR; 3512 delete fLF; 3513 delete fCM; 3514 delete fNL; 3515 delete fWJ; 3516 delete fZW; 3517 delete fGL; 3518 delete fCB; 3519 delete fSP; 3520 delete fB2; 3521 delete fBA; 3522 delete fBB; 3523 delete fHY; 3524 delete fH2; 3525 delete fH3; 3526 delete fCL; 3527 delete fCP; 3528 delete fEX; 3529 delete fIN; 3530 delete fJL; 3531 delete fJV; 3532 delete fJT; 3533 delete fNS; 3534 delete fOP; 3535 delete fQU; 3536 delete fIS; 3537 delete fNU; 3538 delete fPO; 3539 delete fPR; 3540 delete fSY; 3541 delete fAI; 3542 delete fAL; 3543 delete fCJ; 3544 delete fHL; 3545 delete fID; 3546 delete fRI; 3547 delete fSA; 3548 delete fSG; 3549 delete fXX; 3550 3551 delete fCharBI; 3552 delete fNumberMatcher; 3553 } 3554 3555 3556 //------------------------------------------------------------------------------------------- 3557 // 3558 // TestMonkey 3559 // 3560 // params 3561 // seed=nnnnn Random number starting seed. 3562 // Setting the seed allows errors to be reproduced. 3563 // loop=nnn Looping count. Controls running time. 3564 // -1: run forever. 3565 // 0 or greater: run length. 3566 // 3567 // type = char | word | line | sent | title 3568 // 3569 //------------------------------------------------------------------------------------------- 3570 3571 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3572 int32_t val = defaultVal; 3573 name.append(" *= *(-?\\d+)"); 3574 UErrorCode status = U_ZERO_ERROR; 3575 RegexMatcher m(name, params, 0, status); 3576 if (m.find()) { 3577 // The param exists. Convert the string to an int. 3578 char valString[100]; 3579 int32_t paramLength = m.end(1, status) - m.start(1, status); 3580 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3581 paramLength = (int32_t)(sizeof(valString)-2); 3582 } 3583 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3584 val = strtol(valString, NULL, 10); 3585 3586 // Delete this parameter from the params string. 3587 m.reset(); 3588 params = m.replaceFirst("", status); 3589 } 3590 U_ASSERT(U_SUCCESS(status)); 3591 return val; 3592 } 3593 #endif 3594 3595 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3596 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3597 BreakIterator *bi, 3598 int expected[], 3599 int expectedcount) 3600 { 3601 int count = 0; 3602 int i = 0; 3603 int forward[50]; 3604 bi->setText(ustr); 3605 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3606 forward[count] = i; 3607 if (count < expectedcount && expected[count] != i) { 3608 test->errln("break forward test failed: expected %d but got %d", 3609 expected[count], i); 3610 break; 3611 } 3612 count ++; 3613 } 3614 if (count != expectedcount) { 3615 printStringBreaks(ustr, expected, expectedcount); 3616 test->errln("break forward test failed: missed %d match", 3617 expectedcount - count); 3618 return; 3619 } 3620 // testing boundaries 3621 for (i = 1; i < expectedcount; i ++) { 3622 int j = expected[i - 1]; 3623 if (!bi->isBoundary(j)) { 3624 printStringBreaks(ustr, expected, expectedcount); 3625 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3626 return; 3627 } 3628 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3629 if (bi->isBoundary(j)) { 3630 printStringBreaks(ustr, expected, expectedcount); 3631 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3632 return; 3633 } 3634 } 3635 } 3636 3637 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3638 count --; 3639 if (forward[count] != i) { 3640 printStringBreaks(ustr, expected, expectedcount); 3641 test->errln("happy break test previous() failed: expected %d but got %d", 3642 forward[count], i); 3643 break; 3644 } 3645 } 3646 if (count != 0) { 3647 printStringBreaks(ustr, expected, expectedcount); 3648 test->errln("break test previous() failed: missed a match"); 3649 return; 3650 } 3651 3652 // testing preceding 3653 for (i = 0; i < expectedcount - 1; i ++) { 3654 // int j = expected[i] + 1; 3655 int j = ustr.moveIndex32(expected[i], 1); 3656 for (; j <= expected[i + 1]; j ++) { 3657 if (bi->preceding(j) != expected[i]) { 3658 printStringBreaks(ustr, expected, expectedcount); 3659 test->errln("preceding(): Not expecting boundary at position %d", j); 3660 return; 3661 } 3662 } 3663 } 3664 } 3665 #endif 3666 3667 void RBBITest::TestWordBreaks(void) 3668 { 3669 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3670 3671 Locale locale("en"); 3672 UErrorCode status = U_ZERO_ERROR; 3673 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3674 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3675 // Replaced any C+J characters in a row with a random sequence of characters 3676 // of the same length to make our C+J segmentation not get in the way. 3677 static const char *strlist[] = 3678 { 3679 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3680 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3681 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3682 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3683 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3684 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3685 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3686 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3687 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3688 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3689 "\\u2027\\U000e0067\\u0a47\\u00b7", 3690 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3691 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3692 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3693 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3694 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3695 "\\u0027\\u11af\\U000e0057\\u0602", 3696 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3697 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3698 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3699 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3700 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3701 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3702 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3703 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3704 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3705 "\\u18f4\\U000e0049\\u20e7\\u2027", 3706 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3707 "\\ua183\\u102d\\u0bec\\u003a", 3708 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3709 "\\u003a\\u0e57\\u0fad\\u002e", 3710 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3711 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3712 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3713 "\\u003a\\u0664\\u00b7\\u1fba", 3714 "\\u003b\\u0027\\u00b7\\u47a3", 3715 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3716 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3717 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3718 }; 3719 int loop; 3720 if (U_FAILURE(status)) { 3721 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3722 return; 3723 } 3724 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3725 // printf("looping %d\n", loop); 3726 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3727 // RBBICharMonkey monkey; 3728 RBBIWordMonkey monkey; 3729 3730 int expected[50]; 3731 int expectedcount = 0; 3732 3733 monkey.setText(ustr); 3734 int i; 3735 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3736 expected[expectedcount ++] = i; 3737 } 3738 3739 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3740 } 3741 delete bi; 3742 #endif 3743 } 3744 3745 void RBBITest::TestWordBoundary(void) 3746 { 3747 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3748 Locale locale("en"); 3749 UErrorCode status = U_ZERO_ERROR; 3750 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3751 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3752 UChar str[50]; 3753 static const char *strlist[] = 3754 { 3755 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3756 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3757 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3758 "\\u2027\\U000e0067\\u0a47\\u00b7", 3759 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3760 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3761 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3762 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3763 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3764 "\\u0027\\u11af\\U000e0057\\u0602", 3765 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3766 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3767 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3768 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3769 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3770 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3771 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3772 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3773 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3774 "\\u58f4\\U000e0049\\u20e7\\u2027", 3775 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3776 "\\ua183\\u102d\\u0bec\\u003a", 3777 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3778 "\\u003a\\u0e57\\u0fad\\u002e", 3779 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3780 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3781 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3782 "\\u003a\\u0664\\u00b7\\u1fba", 3783 "\\u003b\\u0027\\u00b7\\u47a3", 3784 }; 3785 int loop; 3786 if (U_FAILURE(status)) { 3787 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3788 return; 3789 } 3790 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3791 // printf("looping %d\n", loop); 3792 u_unescape(strlist[loop], str, 20); 3793 UnicodeString ustr(str); 3794 int forward[50]; 3795 int count = 0; 3796 3797 bi->setText(ustr); 3798 int prev = 0; 3799 int i; 3800 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3801 forward[count ++] = i; 3802 if (i > prev) { 3803 int j; 3804 for (j = prev + 1; j < i; j ++) { 3805 if (bi->isBoundary(j)) { 3806 printStringBreaks(ustr, forward, count); 3807 errln("happy boundary test failed: expected %d not a boundary", 3808 j); 3809 return; 3810 } 3811 } 3812 } 3813 if (!bi->isBoundary(i)) { 3814 printStringBreaks(ustr, forward, count); 3815 errln("happy boundary test failed: expected %d a boundary", 3816 i); 3817 return; 3818 } 3819 prev = i; 3820 } 3821 } 3822 delete bi; 3823 } 3824 3825 void RBBITest::TestLineBreaks(void) 3826 { 3827 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3828 Locale locale("en"); 3829 UErrorCode status = U_ZERO_ERROR; 3830 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3831 const int32_t STRSIZE = 50; 3832 UChar str[STRSIZE]; 3833 static const char *strlist[] = 3834 { 3835 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3836 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3837 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3838 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3839 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3840 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3841 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3842 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3843 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3844 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3845 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3846 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3847 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3848 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3849 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3850 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3851 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3852 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3853 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3854 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3855 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3856 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3857 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3858 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3859 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3860 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3861 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3862 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3863 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3864 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3865 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3866 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3867 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3868 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3869 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3870 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3871 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3872 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3873 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3874 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3875 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3876 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3877 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3878 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3879 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3880 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3881 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3882 }; 3883 int loop; 3884 TEST_ASSERT_SUCCESS(status); 3885 if (U_FAILURE(status)) { 3886 return; 3887 } 3888 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3889 // printf("looping %d\n", loop); 3890 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3891 if (t >= STRSIZE) { 3892 TEST_ASSERT(FALSE); 3893 continue; 3894 } 3895 3896 3897 UnicodeString ustr(str); 3898 RBBILineMonkey monkey; 3899 if (U_FAILURE(monkey.deferredStatus)) { 3900 continue; 3901 } 3902 3903 const int EXPECTEDSIZE = 50; 3904 int expected[EXPECTEDSIZE]; 3905 int expectedcount = 0; 3906 3907 monkey.setText(ustr); 3908 int i; 3909 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3910 if (expectedcount >= EXPECTEDSIZE) { 3911 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3912 return; 3913 } 3914 expected[expectedcount ++] = i; 3915 } 3916 3917 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3918 } 3919 delete bi; 3920 #endif 3921 } 3922 3923 void RBBITest::TestSentBreaks(void) 3924 { 3925 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3926 Locale locale("en"); 3927 UErrorCode status = U_ZERO_ERROR; 3928 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3929 UChar str[200]; 3930 static const char *strlist[] = 3931 { 3932 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3933 "This\n", 3934 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3935 "\"Sentence ending with a quote.\" Bye.", 3936 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3937 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3938 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3939 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3940 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3941 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3942 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3943 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3944 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3945 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3946 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3947 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3948 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3949 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3950 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3951 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3952 }; 3953 int loop; 3954 if (U_FAILURE(status)) { 3955 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3956 return; 3957 } 3958 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3959 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3960 UnicodeString ustr(str); 3961 3962 RBBISentMonkey monkey; 3963 if (U_FAILURE(monkey.deferredStatus)) { 3964 continue; 3965 } 3966 3967 const int EXPECTEDSIZE = 50; 3968 int expected[EXPECTEDSIZE]; 3969 int expectedcount = 0; 3970 3971 monkey.setText(ustr); 3972 int i; 3973 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3974 if (expectedcount >= EXPECTEDSIZE) { 3975 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3976 return; 3977 } 3978 expected[expectedcount ++] = i; 3979 } 3980 3981 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3982 } 3983 delete bi; 3984 #endif 3985 } 3986 3987 void RBBITest::TestMonkey(char *params) { 3988 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3989 3990 UErrorCode status = U_ZERO_ERROR; 3991 int32_t loopCount = 500; 3992 int32_t seed = 1; 3993 UnicodeString breakType = "all"; 3994 Locale locale("en"); 3995 UBool useUText = FALSE; 3996 3997 if (quick == FALSE) { 3998 loopCount = 10000; 3999 } 4000 4001 if (params) { 4002 UnicodeString p(params); 4003 loopCount = getIntParam("loop", p, loopCount); 4004 seed = getIntParam("seed", p, seed); 4005 4006 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4007 if (m.find()) { 4008 breakType = m.group(1, status); 4009 m.reset(); 4010 p = m.replaceFirst("", status); 4011 } 4012 4013 RegexMatcher u(" *utext", p, 0, status); 4014 if (u.find()) { 4015 useUText = TRUE; 4016 u.reset(); 4017 p = u.replaceFirst("", status); 4018 } 4019 4020 4021 // m.reset(p); 4022 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4023 // Each option is stripped out of the option string as it is processed. 4024 // All options have been checked. The option string should have been completely emptied.. 4025 char buf[100]; 4026 p.extract(buf, sizeof(buf), NULL, status); 4027 buf[sizeof(buf)-1] = 0; 4028 errln("Unrecognized or extra parameter: %s\n", buf); 4029 return; 4030 } 4031 4032 } 4033 4034 if (breakType == "char" || breakType == "all") { 4035 RBBICharMonkey m; 4036 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4037 if (U_SUCCESS(status)) { 4038 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4039 if (breakType == "all" && useUText==FALSE) { 4040 // Also run a quick test with UText when "all" is specified 4041 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4042 } 4043 } 4044 else { 4045 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4046 } 4047 delete bi; 4048 } 4049 4050 if (breakType == "word" || breakType == "all") { 4051 logln("Word Break Monkey Test"); 4052 RBBIWordMonkey m; 4053 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4054 if (U_SUCCESS(status)) { 4055 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4056 } 4057 else { 4058 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4059 } 4060 delete bi; 4061 } 4062 4063 if (breakType == "line" || breakType == "all") { 4064 logln("Line Break Monkey Test"); 4065 RBBILineMonkey m; 4066 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4067 if (loopCount >= 10) { 4068 loopCount = loopCount / 5; // Line break runs slower than the others. 4069 } 4070 if (U_SUCCESS(status)) { 4071 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4072 } 4073 else { 4074 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4075 } 4076 delete bi; 4077 } 4078 4079 if (breakType == "sent" || breakType == "all" ) { 4080 logln("Sentence Break Monkey Test"); 4081 RBBISentMonkey m; 4082 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4083 if (loopCount >= 10) { 4084 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4085 } 4086 if (U_SUCCESS(status)) { 4087 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4088 } 4089 else { 4090 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4091 } 4092 delete bi; 4093 } 4094 4095 #endif 4096 } 4097 4098 // 4099 // Run a RBBI monkey test. Common routine, for all break iterator types. 4100 // Parameters: 4101 // bi - the break iterator to use 4102 // mk - MonkeyKind, abstraction for obtaining expected results 4103 // name - Name of test (char, word, etc.) for use in error messages 4104 // seed - Seed for starting random number generator (parameter from user) 4105 // numIterations 4106 // 4107 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4108 int32_t numIterations, UBool useUText) { 4109 4110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4111 4112 const int32_t TESTSTRINGLEN = 500; 4113 UnicodeString testText; 4114 int32_t numCharClasses; 4115 UVector *chClasses; 4116 int expected[TESTSTRINGLEN*2 + 1]; 4117 int expectedCount = 0; 4118 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4119 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4120 char reverseBreaks[TESTSTRINGLEN*2+1]; 4121 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4122 char followingBreaks[TESTSTRINGLEN*2+1]; 4123 char precedingBreaks[TESTSTRINGLEN*2+1]; 4124 int i; 4125 int loopCount = 0; 4126 4127 m_seed = seed; 4128 4129 numCharClasses = mk.charClasses()->size(); 4130 chClasses = mk.charClasses(); 4131 4132 // Check for errors that occured during the construction of the MonkeyKind object. 4133 // Can't report them where they occured because errln() is a method coming from intlTest, 4134 // and is not visible outside of RBBITest :-( 4135 if (U_FAILURE(mk.deferredStatus)) { 4136 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4137 return; 4138 } 4139 4140 // Verify that the character classes all have at least one member. 4141 for (i=0; i<numCharClasses; i++) { 4142 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4143 if (s == NULL || s->size() == 0) { 4144 errln("Character Class #%d is null or of zero size.", i); 4145 return; 4146 } 4147 } 4148 4149 while (loopCount < numIterations || numIterations == -1) { 4150 if (numIterations == -1 && loopCount % 10 == 0) { 4151 // If test is running in an infinite loop, display a periodic tic so 4152 // we can tell that it is making progress. 4153 fprintf(stderr, "."); 4154 } 4155 // Save current random number seed, so that we can recreate the random numbers 4156 // for this loop iteration in event of an error. 4157 seed = m_seed; 4158 4159 // Populate a test string with data. 4160 testText.truncate(0); 4161 for (i=0; i<TESTSTRINGLEN; i++) { 4162 int32_t aClassNum = m_rand() % numCharClasses; 4163 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4164 int32_t charIdx = m_rand() % classSet->size(); 4165 UChar32 c = classSet->charAt(charIdx); 4166 if (c < 0) { // TODO: deal with sets containing strings. 4167 errln("c < 0"); 4168 break; 4169 } 4170 testText.append(c); 4171 } 4172 4173 // Calculate the expected results for this test string. 4174 mk.setText(testText); 4175 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4176 expectedBreaks[0] = 1; 4177 int32_t breakPos = 0; 4178 expectedCount = 0; 4179 for (;;) { 4180 breakPos = mk.next(breakPos); 4181 if (breakPos == -1) { 4182 break; 4183 } 4184 if (breakPos > testText.length()) { 4185 errln("breakPos > testText.length()"); 4186 } 4187 expectedBreaks[breakPos] = 1; 4188 U_ASSERT(expectedCount<testText.length()); 4189 expected[expectedCount ++] = breakPos; 4190 (void)expected; // Set but not used warning. 4191 // TODO (andy): check it out. 4192 } 4193 4194 // Find the break positions using forward iteration 4195 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4196 if (useUText) { 4197 UErrorCode status = U_ZERO_ERROR; 4198 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4199 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4200 bi->setText(testUText, status); 4201 TEST_ASSERT_SUCCESS(status); 4202 utext_close(testUText); // The break iterator does a shallow clone of the UText 4203 // This UText can be closed immediately, so long as the 4204 // testText string continues to exist. 4205 } else { 4206 bi->setText(testText); 4207 } 4208 4209 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4210 if (i < 0 || i > testText.length()) { 4211 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4212 break; 4213 } 4214 forwardBreaks[i] = 1; 4215 } 4216 4217 // Find the break positions using reverse iteration 4218 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4219 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4220 if (i < 0 || i > testText.length()) { 4221 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4222 break; 4223 } 4224 reverseBreaks[i] = 1; 4225 } 4226 4227 // Find the break positions using isBoundary() tests. 4228 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4229 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4230 for (i=0; i<=testText.length(); i++) { 4231 isBoundaryBreaks[i] = bi->isBoundary(i); 4232 } 4233 4234 4235 // Find the break positions using the following() function. 4236 // printf("."); 4237 memset(followingBreaks, 0, sizeof(followingBreaks)); 4238 int32_t lastBreakPos = 0; 4239 followingBreaks[0] = 1; 4240 for (i=0; i<testText.length(); i++) { 4241 breakPos = bi->following(i); 4242 if (breakPos <= i || 4243 breakPos < lastBreakPos || 4244 breakPos > testText.length() || 4245 (breakPos > lastBreakPos && lastBreakPos > i)) { 4246 errln("%s break monkey test: " 4247 "Out of range value returned by BreakIterator::following().\n" 4248 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4249 name, seed, i, breakPos, lastBreakPos); 4250 break; 4251 } 4252 followingBreaks[breakPos] = 1; 4253 lastBreakPos = breakPos; 4254 } 4255 4256 // Find the break positions using the preceding() function. 4257 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4258 lastBreakPos = testText.length(); 4259 precedingBreaks[testText.length()] = 1; 4260 for (i=testText.length(); i>0; i--) { 4261 breakPos = bi->preceding(i); 4262 if (breakPos >= i || 4263 breakPos > lastBreakPos || 4264 (breakPos < 0 && testText.getChar32Start(i)>0) || 4265 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4266 errln("%s break monkey test: " 4267 "Out of range value returned by BreakIterator::preceding().\n" 4268 "index=%d; prev returned %d; lastBreak=%d" , 4269 name, i, breakPos, lastBreakPos); 4270 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4271 precedingBreaks[i] = 2; // Forces an error. 4272 } 4273 } else { 4274 if (breakPos >= 0) { 4275 precedingBreaks[breakPos] = 1; 4276 } 4277 lastBreakPos = breakPos; 4278 } 4279 } 4280 4281 // Compare the expected and actual results. 4282 for (i=0; i<=testText.length(); i++) { 4283 const char *errorType = NULL; 4284 if (forwardBreaks[i] != expectedBreaks[i]) { 4285 errorType = "next()"; 4286 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4287 errorType = "previous()"; 4288 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4289 errorType = "isBoundary()"; 4290 } else if (followingBreaks[i] != expectedBreaks[i]) { 4291 errorType = "following()"; 4292 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4293 errorType = "preceding()"; 4294 } 4295 4296 4297 if (errorType != NULL) { 4298 // Format a range of the test text that includes the failure as 4299 // a data item that can be included in the rbbi test data file. 4300 4301 // Start of the range is the last point where expected and actual results 4302 // both agreed that there was a break position. 4303 int startContext = i; 4304 int32_t count = 0; 4305 for (;;) { 4306 if (startContext==0) { break; } 4307 startContext --; 4308 if (expectedBreaks[startContext] != 0) { 4309 if (count == 2) break; 4310 count ++; 4311 } 4312 } 4313 4314 // End of range is two expected breaks past the start position. 4315 int endContext = i + 1; 4316 int ci; 4317 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4318 for (;;) { 4319 if (endContext >= testText.length()) {break;} 4320 if (expectedBreaks[endContext-1] != 0) { 4321 if (count == 0) break; 4322 count --; 4323 } 4324 endContext ++; 4325 } 4326 } 4327 4328 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4329 UnicodeString errorText = "<data>"; 4330 /***if (strcmp(errorType, "next()") == 0) { 4331 startContext = 0; 4332 endContext = testText.length(); 4333 4334 printStringBreaks(testText, expected, expectedCount); 4335 }***/ 4336 4337 for (ci=startContext; ci<endContext;) { 4338 UnicodeString hexChars("0123456789abcdef"); 4339 UChar32 c; 4340 int bn; 4341 c = testText.char32At(ci); 4342 if (ci == i) { 4343 // This is the location of the error. 4344 errorText.append("<?>"); 4345 } else if (expectedBreaks[ci] != 0) { 4346 // This a non-error expected break position. 4347 errorText.append("\\"); 4348 } 4349 if (c < 0x10000) { 4350 errorText.append("\\u"); 4351 for (bn=12; bn>=0; bn-=4) { 4352 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4353 } 4354 } else { 4355 errorText.append("\\U"); 4356 for (bn=28; bn>=0; bn-=4) { 4357 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4358 } 4359 } 4360 ci = testText.moveIndex32(ci, 1); 4361 } 4362 errorText.append("\\"); 4363 errorText.append("</data>\n"); 4364 4365 // Output the error 4366 char charErrorTxt[500]; 4367 UErrorCode status = U_ZERO_ERROR; 4368 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4369 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4370 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4371 4372 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4373 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4374 errorType, seed, i, charErrorTxt); 4375 break; 4376 } 4377 } 4378 4379 loopCount++; 4380 } 4381 #endif 4382 } 4383 4384 4385 // Bug 5532. UTF-8 based UText fails in dictionary code. 4386 // This test checks the initial patch, 4387 // which is to just keep it from crashing. Correct word boundaries 4388 // await a proper fix to the dictionary code. 4389 // 4390 void RBBITest::TestBug5532(void) { 4391 // Text includes a mixture of Thai and Latin. 4392 const unsigned char utf8Data[] = { 4393 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4394 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4395 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4396 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4397 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4398 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4399 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4400 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4401 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4402 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4403 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4404 4405 UErrorCode status = U_ZERO_ERROR; 4406 UText utext=UTEXT_INITIALIZER; 4407 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4408 TEST_ASSERT_SUCCESS(status); 4409 4410 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4411 TEST_ASSERT_SUCCESS(status); 4412 if (U_SUCCESS(status)) { 4413 bi->setText(&utext, status); 4414 TEST_ASSERT_SUCCESS(status); 4415 4416 int32_t breakCount = 0; 4417 int32_t previousBreak = -1; 4418 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4419 // For now, just make sure that the break iterator doesn't hang. 4420 TEST_ASSERT(previousBreak < bi->current()); 4421 previousBreak = bi->current(); 4422 } 4423 TEST_ASSERT(breakCount > 0); 4424 } 4425 delete bi; 4426 utext_close(&utext); 4427 } 4428 4429 4430 void RBBITest::TestBug9983(void) { 4431 UnicodeString text = UnicodeString("\\u002A" // * Other 4432 "\\uFF65" // Other 4433 "\\u309C" // Katakana 4434 "\\uFF9F" // Extend 4435 "\\uFF65" // Other 4436 "\\u0020" // Other 4437 "\\u0000").unescape(); 4438 4439 UErrorCode status = U_ZERO_ERROR; 4440 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4441 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4442 TEST_ASSERT_SUCCESS(status); 4443 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4444 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4445 TEST_ASSERT_SUCCESS(status); 4446 if (U_FAILURE(status)) { 4447 return; 4448 } 4449 int32_t offset, rstatus, iterationCount; 4450 4451 brkiter->setText(text); 4452 brkiter->last(); 4453 iterationCount = 0; 4454 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4455 iterationCount++; 4456 rstatus = brkiter->getRuleStatus(); 4457 (void)rstatus; // Suppress set but not used warning. 4458 if (iterationCount >= 10) { 4459 break; 4460 } 4461 } 4462 TEST_ASSERT(iterationCount == 6); 4463 4464 brkiterPOSIX->setText(text); 4465 brkiterPOSIX->last(); 4466 iterationCount = 0; 4467 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4468 iterationCount++; 4469 rstatus = brkiterPOSIX->getRuleStatus(); 4470 (void)rstatus; // Suppress set but not used warning. 4471 if (iterationCount >= 10) { 4472 break; 4473 } 4474 } 4475 TEST_ASSERT(iterationCount == 6); 4476 } 4477 4478 4479 // 4480 // TestDebug - A place-holder test for debugging purposes. 4481 // For putting in fragments of other tests that can be invoked 4482 // for tracing without a lot of unwanted extra stuff happening. 4483 // 4484 void RBBITest::TestDebug(void) { 4485 #if 0 4486 UErrorCode status = U_ZERO_ERROR; 4487 int pos = 0; 4488 int ruleStatus = 0; 4489 4490 RuleBasedBreakIterator* bi = 4491 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4492 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4493 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4494 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4495 // UnicodeString s("Aaa. Bcd"); 4496 s = s.unescape(); 4497 bi->setText(s); 4498 UBool r = bi->isBoundary(8); 4499 printf("%s", r?"true":"false"); 4500 return; 4501 pos = bi->last(); 4502 do { 4503 // ruleStatus = bi->getRuleStatus(); 4504 printf("%d\t%d\n", pos, ruleStatus); 4505 pos = bi->previous(); 4506 } while (pos != BreakIterator::DONE); 4507 #endif 4508 } 4509 4510 void RBBITest::TestProperties() { 4511 UErrorCode errorCode = U_ZERO_ERROR; 4512 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4513 if (!prependSet.isEmpty()) { 4514 errln( 4515 "[:GCB=Prepend:] is not empty any more. " 4516 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4517 "change this test to the opposite condition."); 4518 } 4519 } 4520 4521 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4522