1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "utypeinfo.h" // for 'typeid' to work 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/utypes.h" 19 #include "unicode/brkiter.h" 20 #include "unicode/rbbi.h" 21 #include "unicode/uchar.h" 22 #include "unicode/utf16.h" 23 #include "unicode/ucnv.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uniset.h" 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 #include "unicode/regex.h" 28 #endif 29 #include "unicode/ustring.h" 30 #include "unicode/utext.h" 31 #include "intltest.h" 32 #include "rbbitst.h" 33 #include <string.h> 34 #include "charstr.h" 35 #include "uvector.h" 36 #include "uvectr32.h" 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include "unicode/numfmt.h" 40 #include "unicode/uscript.h" 41 #include "cmemory.h" 42 43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 44 #include "unicode/filteredbrk.h" 45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION 46 47 #define TEST_ASSERT(x) {if (!(x)) { \ 48 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 49 50 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 51 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 52 53 54 //--------------------------------------------- 55 // runIndexedTest 56 //--------------------------------------------- 57 58 59 // Note: Before adding new tests to this file, check whether the desired test data can 60 // simply be added to the file testdata/rbbitest.txt. In most cases it can, 61 // it's much less work than writing a new test, diagnostic output in the event of failures 62 // is good, and the test data file will is shared with ICU4J, so eventually the test 63 // will run there as well, without additional effort. 64 65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 66 { 67 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 68 69 switch (index) { 70 #if !UCONFIG_NO_FILE_IO 71 case 0: name = "TestBug4153072"; 72 if(exec) TestBug4153072(); break; 73 #else 74 case 0: name = "skip"; 75 break; 76 #endif 77 78 case 1: name = "skip"; 79 break; 80 case 2: name = "TestStatusReturn"; 81 if(exec) TestStatusReturn(); break; 82 83 #if !UCONFIG_NO_FILE_IO 84 case 3: name = "TestUnicodeFiles"; 85 if(exec) TestUnicodeFiles(); break; 86 case 4: name = "TestEmptyString"; 87 if(exec) TestEmptyString(); break; 88 #else 89 case 3: case 4: name = "skip"; 90 break; 91 #endif 92 93 case 5: name = "TestGetAvailableLocales"; 94 if(exec) TestGetAvailableLocales(); break; 95 96 case 6: name = "TestGetDisplayName"; 97 if(exec) TestGetDisplayName(); break; 98 99 #if !UCONFIG_NO_FILE_IO 100 case 7: name = "TestEndBehaviour"; 101 if(exec) TestEndBehaviour(); break; 102 case 8: case 9: case 10: name = "skip"; 103 break; 104 case 11: name = "TestWordBreaks"; 105 if(exec) TestWordBreaks(); break; 106 case 12: name = "TestWordBoundary"; 107 if(exec) TestWordBoundary(); break; 108 case 13: name = "TestLineBreaks"; 109 if(exec) TestLineBreaks(); break; 110 case 14: name = "TestSentBreaks"; 111 if(exec) TestSentBreaks(); break; 112 case 15: name = "TestExtended"; 113 if(exec) TestExtended(); break; 114 #else 115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 116 break; 117 #endif 118 119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 120 case 16: 121 name = "TestMonkey"; if(exec) TestMonkey(params); break; 122 #else 123 case 16: 124 name = "skip"; break; 125 #endif 126 127 #if !UCONFIG_NO_FILE_IO 128 case 17: name = "TestBug3818"; 129 if(exec) TestBug3818(); break; 130 #else 131 case 17: name = "skip"; 132 break; 133 #endif 134 135 case 18: name = "skip"; 136 break; 137 case 19: name = "TestDebug"; 138 if(exec) TestDebug(); break; 139 case 20: name = "skip"; 140 break; 141 142 #if !UCONFIG_NO_FILE_IO 143 case 21: name = "TestBug5775"; 144 if (exec) TestBug5775(); break; 145 #else 146 case 21: name = "skip"; 147 break; 148 #endif 149 150 case 22: name = "TestBug9983"; 151 if (exec) TestBug9983(); break; 152 case 23: name = "TestDictRules"; 153 if (exec) TestDictRules(); break; 154 case 24: name = "TestBug5532"; 155 if (exec) TestBug5532(); break; 156 default: name = ""; break; //needed to end loop 157 } 158 } 159 160 161 //--------------------------------------------------------------------------- 162 // 163 // class BITestData Holds a set of Break iterator test data and results 164 // Includes 165 // - the string data to be broken 166 // - a vector of the expected break positions. 167 // - a vector of source line numbers for the data, 168 // (to help see where errors occured.) 169 // - The expected break tag values. 170 // - Vectors of actual break positions and tag values. 171 // - Functions for comparing actual with expected and 172 // reporting errors. 173 // 174 //---------------------------------------------------------------------------- 175 class BITestData { 176 public: 177 UnicodeString fDataToBreak; 178 UVector fExpectedBreakPositions; 179 UVector fExpectedTags; 180 UVector fLineNum; 181 UVector fActualBreakPositions; // Test Results. 182 UVector fActualTags; 183 184 BITestData(UErrorCode &status); 185 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 186 void checkResults(const char *heading, RBBITest *test); 187 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 188 void clearResults(); 189 }; 190 191 // 192 // Constructor. 193 // 194 BITestData::BITestData(UErrorCode &status) 195 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 196 fActualTags(status) 197 { 198 } 199 200 // 201 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 202 // The macro form collects the line number, which is helpful 203 // when tracking down failures. 204 // 205 // A null data item is inserted at the start of each test's data 206 // to put the starting zero into the data list. The position saved for 207 // each non-null item is its ending position. 208 // 209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 211 if (U_FAILURE(status)) {return;} 212 if (data != NULL) { 213 fDataToBreak.append(CharsToUnicodeString(data)); 214 } 215 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 216 fExpectedTags.addElement(tag, status); 217 fLineNum.addElement(lineNum, status); 218 } 219 220 221 // 222 // checkResults. Compare the actual and expected break positions, report any differences. 223 // 224 void BITestData::checkResults(const char *heading, RBBITest *test) { 225 int32_t expectedIndex = 0; 226 int32_t actualIndex = 0; 227 228 for (;;) { 229 // If we've run through both the expected and actual results vectors, we're done. 230 // break out of the loop. 231 if (expectedIndex >= fExpectedBreakPositions.size() && 232 actualIndex >= fActualBreakPositions.size()) { 233 break; 234 } 235 236 237 if (expectedIndex >= fExpectedBreakPositions.size()) { 238 err(heading, test, expectedIndex-1, actualIndex); 239 actualIndex++; 240 continue; 241 } 242 243 if (actualIndex >= fActualBreakPositions.size()) { 244 err(heading, test, expectedIndex, actualIndex-1); 245 expectedIndex++; 246 continue; 247 } 248 249 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 250 err(heading, test, expectedIndex, actualIndex); 251 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 252 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 253 actualIndex++; 254 } else { 255 expectedIndex++; 256 } 257 continue; 258 } 259 260 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 261 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 262 heading, fLineNum.elementAt(expectedIndex), 263 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 264 } 265 266 actualIndex++; 267 expectedIndex++; 268 } 269 } 270 271 // 272 // err - An error was found. Report it, along with information about where the 273 // incorrectly broken test data appeared in the source file. 274 // 275 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 276 { 277 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 278 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 279 int32_t o = 0; 280 int32_t line = fLineNum.elementAti(expectedIdx); 281 if (expectedIdx > 0) { 282 // The line numbers are off by one because a premature break occurs somewhere 283 // within the previous item, rather than at the start of the current (expected) item. 284 // We want to report the offset of the unexpected break from the start of 285 // this previous item. 286 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 287 } 288 if (actual < expected) { 289 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 290 } else { 291 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 292 } 293 } 294 295 296 void BITestData::clearResults() { 297 fActualBreakPositions.removeAllElements(); 298 fActualTags.removeAllElements(); 299 } 300 301 302 //-------------------------------------------------------------------------------------- 303 // 304 // RBBITest constructor and destructor 305 // 306 //-------------------------------------------------------------------------------------- 307 308 RBBITest::RBBITest() { 309 } 310 311 312 RBBITest::~RBBITest() { 313 } 314 315 //----------------------------------------------------------------------------------- 316 // 317 // Test for status {tag} return value from break rules. 318 // TODO: a more thorough test. 319 // 320 //----------------------------------------------------------------------------------- 321 void RBBITest::TestStatusReturn() { 322 UnicodeString rulesString1("$Letters = [:L:];\n" 323 "$Numbers = [:N:];\n" 324 "$Letters+{1};\n" 325 "$Numbers+{2};\n" 326 "Help\\ {4}/me\\!;\n" 327 "[^$Letters $Numbers];\n" 328 "!.*;\n", -1, US_INV); 329 UnicodeString testString1 = "abc123..abc Help me Help me!"; 330 // 01234567890123456789012345678 331 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 332 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 333 334 UErrorCode status=U_ZERO_ERROR; 335 UParseError parseError; 336 337 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 338 if(U_FAILURE(status)) { 339 dataerrln("FAIL : in construction - %s", u_errorName(status)); 340 } else { 341 int32_t pos; 342 int32_t i = 0; 343 bi->setText(testString1); 344 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 345 if (pos != bounds1[i]) { 346 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 347 break; 348 } 349 350 int tag = bi->getRuleStatus(); 351 if (tag != brkStatus[i]) { 352 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 353 break; 354 } 355 i++; 356 } 357 } 358 delete bi; 359 } 360 361 362 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 363 UErrorCode status = U_ZERO_ERROR; 364 char name[100]; 365 printf("code alpha extend alphanum type word sent line name\n"); 366 int nextExpectedIndex = 0; 367 utext_setNativeIndex(tstr, 0); 368 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 369 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 370 printf("------------------------------------------------ %d\n", j); 371 ++nextExpectedIndex; 372 } 373 374 UChar32 c = utext_next32(tstr); 375 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 376 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, 377 u_isUAlphabetic(c), 378 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 379 u_isalnum(c), 380 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 381 u_charType(c), 382 U_SHORT_PROPERTY_NAME), 383 u_getPropertyValueName(UCHAR_WORD_BREAK, 384 u_getIntPropertyValue(c, 385 UCHAR_WORD_BREAK), 386 U_SHORT_PROPERTY_NAME), 387 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 388 u_getIntPropertyValue(c, 389 UCHAR_SENTENCE_BREAK), 390 U_SHORT_PROPERTY_NAME), 391 u_getPropertyValueName(UCHAR_LINE_BREAK, 392 u_getIntPropertyValue(c, 393 UCHAR_LINE_BREAK), 394 U_SHORT_PROPERTY_NAME), 395 name); 396 } 397 } 398 399 400 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 401 UErrorCode status = U_ZERO_ERROR; 402 UText *tstr = NULL; 403 tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 404 if (U_FAILURE(status)) { 405 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status)); 406 return; 407 } 408 printStringBreaks(tstr, expected, expectedCount); 409 utext_close(tstr); 410 } 411 412 413 void RBBITest::TestBug3818() { 414 UErrorCode status = U_ZERO_ERROR; 415 416 // Four Thai words... 417 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 418 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 419 UnicodeString thaiStr(thaiWordData); 420 421 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 422 if (U_FAILURE(status) || bi == NULL) { 423 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 424 return; 425 } 426 bi->setText(thaiStr); 427 428 int32_t startOfSecondWord = bi->following(1); 429 if (startOfSecondWord != 4) { 430 errln("Fail at file %s, line %d expected start of word at 4, got %d", 431 __FILE__, __LINE__, startOfSecondWord); 432 } 433 startOfSecondWord = bi->following(0); 434 if (startOfSecondWord != 4) { 435 errln("Fail at file %s, line %d expected start of word at 4, got %d", 436 __FILE__, __LINE__, startOfSecondWord); 437 } 438 delete bi; 439 } 440 441 //---------------------------------------------------------------------------- 442 // 443 // generalIteratorTest Given a break iterator and a set of test data, 444 // Run the tests and report the results. 445 // 446 //---------------------------------------------------------------------------- 447 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 448 { 449 450 bi.setText(td.fDataToBreak); 451 452 testFirstAndNext(bi, td); 453 454 testLastAndPrevious(bi, td); 455 456 testFollowing(bi, td); 457 testPreceding(bi, td); 458 testIsBoundary(bi, td); 459 doMultipleSelectionTest(bi, td); 460 } 461 462 463 // 464 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 465 // kind of loop. 466 // 467 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 468 { 469 UErrorCode status = U_ZERO_ERROR; 470 int32_t p; 471 int32_t lastP = -1; 472 int32_t tag; 473 474 logln("Test first and next"); 475 bi.setText(td.fDataToBreak); 476 td.clearResults(); 477 478 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 479 td.fActualBreakPositions.addElement(p, status); // Save result. 480 tag = bi.getRuleStatus(); 481 td.fActualTags.addElement(tag, status); 482 if (p <= lastP) { 483 // If the iterator is not making forward progress, stop. 484 // No need to raise an error here, it'll be detected in the normal check of results. 485 break; 486 } 487 lastP = p; 488 } 489 td.checkResults("testFirstAndNext", this); 490 } 491 492 493 // 494 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 495 // 496 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 497 { 498 UErrorCode status = U_ZERO_ERROR; 499 int32_t p; 500 int32_t lastP = 0x7ffffffe; 501 int32_t tag; 502 503 logln("Test last and previous"); 504 bi.setText(td.fDataToBreak); 505 td.clearResults(); 506 507 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 508 // Save break position. Insert it at start of vector of results, shoving 509 // already-saved results further towards the end. 510 td.fActualBreakPositions.insertElementAt(p, 0, status); 511 // bi.previous(); // TODO: Why does this fix things up???? 512 // bi.next(); 513 tag = bi.getRuleStatus(); 514 td.fActualTags.insertElementAt(tag, 0, status); 515 if (p >= lastP) { 516 // If the iterator is not making progress, stop. 517 // No need to raise an error here, it'll be detected in the normal check of results. 518 break; 519 } 520 lastP = p; 521 } 522 td.checkResults("testLastAndPrevious", this); 523 } 524 525 526 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 527 { 528 UErrorCode status = U_ZERO_ERROR; 529 int32_t p; 530 int32_t tag; 531 int32_t lastP = -2; // A value that will never be returned as a break position. 532 // cannot be -1; that is returned for DONE. 533 int i; 534 535 logln("testFollowing():"); 536 bi.setText(td.fDataToBreak); 537 td.clearResults(); 538 539 // Save the starting point, since we won't get that out of following. 540 p = bi.first(); 541 td.fActualBreakPositions.addElement(p, status); // Save result. 542 tag = bi.getRuleStatus(); 543 td.fActualTags.addElement(tag, status); 544 545 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 546 p = bi.following(i); 547 if (p != lastP) { 548 if (p == RuleBasedBreakIterator::DONE) { 549 break; 550 } 551 // We've reached a new break position. Save it. 552 td.fActualBreakPositions.addElement(p, status); // Save result. 553 tag = bi.getRuleStatus(); 554 td.fActualTags.addElement(tag, status); 555 lastP = p; 556 } 557 } 558 // The loop normally exits by means of the break in the middle. 559 // Make sure that the index was at the correct position for the break iterator to have 560 // returned DONE. 561 if (i != td.fDataToBreak.length()) { 562 errln("testFollowing(): iterator returned DONE prematurely."); 563 } 564 565 // Full check of all results. 566 td.checkResults("testFollowing", this); 567 } 568 569 570 571 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 572 UErrorCode status = U_ZERO_ERROR; 573 int32_t p; 574 int32_t tag; 575 int32_t lastP = 0x7ffffffe; 576 int i; 577 578 logln("testPreceding():"); 579 bi.setText(td.fDataToBreak); 580 td.clearResults(); 581 582 p = bi.last(); 583 td.fActualBreakPositions.addElement(p, status); 584 tag = bi.getRuleStatus(); 585 td.fActualTags.addElement(tag, status); 586 587 for (i = td.fDataToBreak.length(); i>=-1; i--) { 588 p = bi.preceding(i); 589 if (p != lastP) { 590 if (p == RuleBasedBreakIterator::DONE) { 591 break; 592 } 593 // We've reached a new break position. Save it. 594 td.fActualBreakPositions.insertElementAt(p, 0, status); 595 lastP = p; 596 tag = bi.getRuleStatus(); 597 td.fActualTags.insertElementAt(tag, 0, status); 598 } 599 } 600 // The loop normally exits by means of the break in the middle. 601 // Make sure that the index was at the correct position for the break iterator to have 602 // returned DONE. 603 if (i != 0) { 604 errln("testPreceding(): iterator returned DONE prematurely."); 605 } 606 607 // Full check of all results. 608 td.checkResults("testPreceding", this); 609 } 610 611 612 613 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 614 UErrorCode status = U_ZERO_ERROR; 615 int i; 616 int32_t tag; 617 618 logln("testIsBoundary():"); 619 bi.setText(td.fDataToBreak); 620 td.clearResults(); 621 622 for (i = 0; i <= td.fDataToBreak.length(); i++) { 623 if (bi.isBoundary(i)) { 624 td.fActualBreakPositions.addElement(i, status); // Save result. 625 tag = bi.getRuleStatus(); 626 td.fActualTags.addElement(tag, status); 627 } 628 } 629 td.checkResults("testIsBoundary: ", this); 630 } 631 632 633 634 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 635 { 636 iterator.setText(td.fDataToBreak); 637 638 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 639 int32_t offset = iterator.first(); 640 int32_t testOffset; 641 int32_t count = 0; 642 643 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 644 645 if (*testIterator != iterator) 646 errln("clone() or operator!= failed: two clones compared unequal"); 647 648 do { 649 testOffset = testIterator->first(); 650 testOffset = testIterator->next(count); 651 if (offset != testOffset) 652 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 653 654 if (offset != RuleBasedBreakIterator::DONE) { 655 count++; 656 offset = iterator.next(); 657 658 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 659 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 660 if (count > 10000 || offset == -1) { 661 errln("operator== failed too many times. Stopping test."); 662 if (offset == -1) { 663 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 664 } 665 return; 666 } 667 } 668 } 669 } while (offset != RuleBasedBreakIterator::DONE); 670 671 // now do it backwards... 672 offset = iterator.last(); 673 count = 0; 674 675 do { 676 testOffset = testIterator->last(); 677 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 678 if (offset != testOffset) 679 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 680 681 if (offset != RuleBasedBreakIterator::DONE) { 682 count--; 683 offset = iterator.previous(); 684 } 685 } while (offset != RuleBasedBreakIterator::DONE); 686 687 delete testIterator; 688 } 689 690 691 //--------------------------------------------- 692 // 693 // other tests 694 // 695 //--------------------------------------------- 696 void RBBITest::TestEmptyString() 697 { 698 UnicodeString text = ""; 699 UErrorCode status = U_ZERO_ERROR; 700 701 BITestData x(status); 702 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 703 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 704 if (U_FAILURE(status)) 705 { 706 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 707 return; 708 } 709 generalIteratorTest(*bi, x); 710 delete bi; 711 } 712 713 void RBBITest::TestGetAvailableLocales() 714 { 715 int32_t locCount = 0; 716 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 717 718 if (locCount == 0) 719 dataerrln("getAvailableLocales() returned an empty list!"); 720 // Just make sure that it's returning good memory. 721 int32_t i; 722 for (i = 0; i < locCount; ++i) { 723 logln(locList[i].getName()); 724 } 725 } 726 727 //Testing the BreakIterator::getDisplayName() function 728 void RBBITest::TestGetDisplayName() 729 { 730 UnicodeString result; 731 732 BreakIterator::getDisplayName(Locale::getUS(), result); 733 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 734 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 735 + result); 736 737 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 738 if (result != "French (France)") 739 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 740 + result); 741 } 742 /** 743 * Test End Behaviour 744 * @bug 4068137 745 */ 746 void RBBITest::TestEndBehaviour() 747 { 748 UErrorCode status = U_ZERO_ERROR; 749 UnicodeString testString("boo."); 750 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 751 if (U_FAILURE(status)) 752 { 753 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status)); 754 return; 755 } 756 wb->setText(testString); 757 758 if (wb->first() != 0) 759 errln("Didn't get break at beginning of string."); 760 if (wb->next() != 3) 761 errln("Didn't get break before period in \"boo.\""); 762 if (wb->current() != 4 && wb->next() != 4) 763 errln("Didn't get break at end of string."); 764 delete wb; 765 } 766 /* 767 * @bug 4153072 768 */ 769 void RBBITest::TestBug4153072() { 770 UErrorCode status = U_ZERO_ERROR; 771 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 772 if (U_FAILURE(status)) 773 { 774 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 775 return; 776 } 777 UnicodeString str("...Hello, World!..."); 778 int32_t begin = 3; 779 int32_t end = str.length() - 3; 780 UBool onBoundary; 781 782 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 783 iter->adoptText(textIterator); 784 int index; 785 // Note: with the switch to UText, there is no way to restrict the 786 // iteration range to begin at an index other than zero. 787 // String character iterators created with a non-zero bound are 788 // treated by RBBI as being empty. 789 for (index = -1; index < begin + 1; ++index) { 790 onBoundary = iter->isBoundary(index); 791 if (index == 0? !onBoundary : onBoundary) { 792 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 793 " and begin index = " + begin); 794 } 795 } 796 delete iter; 797 } 798 799 800 // 801 // Test for problem reported by Ashok Matoria on 9 July 2007 802 // One.<kSoftHyphen><kSpace>Two. 803 // 804 // Sentence break at start (0) and then on calling next() it breaks at 805 // 'T' of "Two". Now, at this point if I do next() and 806 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 807 // 808 void RBBITest::TestBug5775() { 809 UErrorCode status = U_ZERO_ERROR; 810 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 811 TEST_ASSERT_SUCCESS(status); 812 if (U_FAILURE(status)) { 813 return; 814 } 815 // Check for status first for better handling of no data errors. 816 TEST_ASSERT(bi != NULL); 817 if (bi == NULL) { 818 return; 819 } 820 821 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 822 // 01234 56789 823 s = s.unescape(); 824 bi->setText(s); 825 int pos = bi->next(); 826 TEST_ASSERT(pos == 6); 827 pos = bi->next(); 828 TEST_ASSERT(pos == 10); 829 pos = bi->previous(); 830 TEST_ASSERT(pos == 6); 831 delete bi; 832 } 833 834 835 836 //------------------------------------------------------------------------------ 837 // 838 // RBBITest::Extended Run RBBI Tests from an external test data file 839 // 840 //------------------------------------------------------------------------------ 841 842 struct TestParams { 843 BreakIterator *bi; // Break iterator is set while parsing test source. 844 // Changed out whenever test data changes break type. 845 846 UnicodeString dataToBreak; // Data that is built up while parsing the test. 847 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 848 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 849 UVector32 *srcCol; 850 851 UText *textToBreak; // UText, could be UTF8 or UTF16. 852 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 853 CharString utf8String; // UTF-8 form of text to break. 854 855 TestParams(UErrorCode &status) : dataToBreak() { 856 bi = NULL; 857 expectedBreaks = new UVector32(status); 858 srcLine = new UVector32(status); 859 srcCol = new UVector32(status); 860 textToBreak = NULL; 861 textMap = new UVector32(status); 862 } 863 864 ~TestParams() { 865 delete bi; 866 delete expectedBreaks; 867 delete srcLine; 868 delete srcCol; 869 utext_close(textToBreak); 870 delete textMap; 871 } 872 873 int32_t getSrcLine(int32_t bp); 874 int32_t getExpectedBreak(int32_t bp); 875 int32_t getSrcCol(int32_t bp); 876 877 void setUTF16(UErrorCode &status); 878 void setUTF8(UErrorCode &status); 879 }; 880 881 // Append a UnicodeString to a CharString with UTF-8 encoding. 882 // Substitute any invalid chars. 883 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 884 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 885 if (U_FAILURE(status)) { 886 return; 887 } 888 int32_t utf8Length; 889 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 890 src.getBuffer(), src.length(), // UTF-16 data 891 0xfffd, NULL, // Substitution char, number of subs. 892 &status); 893 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 894 return; 895 } 896 status = U_ZERO_ERROR; 897 int32_t capacity; 898 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 899 u_strToUTF8WithSub(buffer, utf8Length, NULL, 900 src.getBuffer(), src.length(), 901 0xfffd, NULL, &status); 902 dest.append(buffer, utf8Length, status); 903 } 904 905 906 void TestParams::setUTF16(UErrorCode &status) { 907 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 908 textMap->removeAllElements(); 909 for (int32_t i=0; i<dataToBreak.length(); i++) { 910 if (i == dataToBreak.getChar32Start(i)) { 911 textMap->addElement(i, status); 912 } else { 913 textMap->addElement(-1, status); 914 } 915 } 916 textMap->addElement(dataToBreak.length(), status); 917 U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 918 } 919 920 921 void TestParams::setUTF8(UErrorCode &status) { 922 if (U_FAILURE(status)) { 923 return; 924 } 925 utf8String.clear(); 926 CharStringAppend(utf8String, dataToBreak, status); 927 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 928 if (U_FAILURE(status)) { 929 return; 930 } 931 932 textMap->removeAllElements(); 933 int32_t utf16Index = 0; 934 for (;;) { 935 textMap->addElement(utf16Index, status); 936 UChar32 c32 = utext_current32(textToBreak); 937 if (c32 < 0) { 938 break; 939 } 940 utf16Index += U16_LENGTH(c32); 941 utext_next32(textToBreak); 942 while (textMap->size() < utext_getNativeIndex(textToBreak)) { 943 textMap->addElement(-1, status); 944 } 945 } 946 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 947 } 948 949 950 int32_t TestParams::getSrcLine(int bp) { 951 if (bp >= textMap->size()) { 952 bp = textMap->size() - 1; 953 } 954 int32_t i = 0; 955 for(; bp >= 0 ; --bp) { 956 // Move to a character boundary if we are not on one already. 957 i = textMap->elementAti(bp); 958 if (i >= 0) { 959 break; 960 } 961 } 962 return srcLine->elementAti(i); 963 } 964 965 966 int32_t TestParams::getExpectedBreak(int bp) { 967 if (bp >= textMap->size()) { 968 return 0; 969 } 970 int32_t i = textMap->elementAti(bp); 971 int32_t retVal = 0; 972 if (i >= 0) { 973 retVal = expectedBreaks->elementAti(i); 974 } 975 return retVal; 976 } 977 978 979 int32_t TestParams::getSrcCol(int bp) { 980 if (bp >= textMap->size()) { 981 bp = textMap->size() - 1; 982 } 983 int32_t i = 0; 984 for(; bp >= 0; --bp) { 985 // Move bp to a character boundary if we are not on one already. 986 i = textMap->elementAti(bp); 987 if (i >= 0) { 988 break; 989 } 990 } 991 return srcCol->elementAti(i); 992 } 993 994 995 void RBBITest::executeTest(TestParams *t, UErrorCode &status) { 996 int32_t bp; 997 int32_t prevBP; 998 int32_t i; 999 1000 TEST_ASSERT_SUCCESS(status); 1001 if (U_FAILURE(status)) { 1002 return; 1003 } 1004 1005 if (t->bi == NULL) { 1006 return; 1007 } 1008 1009 t->bi->setText(t->textToBreak, status); 1010 // 1011 // Run the iterator forward 1012 // 1013 prevBP = -1; 1014 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1015 if (prevBP == bp) { 1016 // Fail for lack of forward progress. 1017 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1018 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1019 break; 1020 } 1021 1022 // Check that there we didn't miss an expected break between the last one 1023 // and this one. 1024 for (i=prevBP+1; i<bp; i++) { 1025 if (t->getExpectedBreak(i) != 0) { 1026 int expected[] = {0, i}; 1027 printStringBreaks(t->dataToBreak, expected, 2); 1028 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1029 i, t->getSrcLine(i), t->getSrcCol(i)); 1030 } 1031 } 1032 1033 // Check that the break we did find was expected 1034 if (t->getExpectedBreak(bp) == 0) { 1035 int expected[] = {0, bp}; 1036 printStringBreaks(t->textToBreak, expected, 2); 1037 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1038 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1039 } else { 1040 // The break was expected. 1041 // Check that the {nnn} tag value is correct. 1042 int32_t expectedTagVal = t->getExpectedBreak(bp); 1043 if (expectedTagVal == -1) { 1044 expectedTagVal = 0; 1045 } 1046 int32_t line = t->getSrcLine(bp); 1047 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1048 if (rs != expectedTagVal) { 1049 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1050 " Actual, Expected status = %4d, %4d", 1051 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1052 } 1053 } 1054 1055 prevBP = bp; 1056 } 1057 1058 // Verify that there were no missed expected breaks after the last one found 1059 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 1060 if (t->getExpectedBreak(i) != 0) { 1061 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1062 i, t->getSrcLine(i), t->getSrcCol(i)); 1063 } 1064 } 1065 1066 // 1067 // Run the iterator backwards, verify that the same breaks are found. 1068 // 1069 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 1070 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1071 if (prevBP == bp) { 1072 // Fail for lack of progress. 1073 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 1074 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1075 break; 1076 } 1077 1078 // Check that we didn't miss an expected break between the last one 1079 // and this one. (UVector returns zeros for index out of bounds.) 1080 for (i=prevBP-1; i>bp; i--) { 1081 if (t->getExpectedBreak(i) != 0) { 1082 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1083 i, t->getSrcLine(i), t->getSrcCol(i)); 1084 } 1085 } 1086 1087 // Check that the break we did find was expected 1088 if (t->getExpectedBreak(bp) == 0) { 1089 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1090 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1091 } else { 1092 // The break was expected. 1093 // Check that the {nnn} tag value is correct. 1094 int32_t expectedTagVal = t->getExpectedBreak(bp); 1095 if (expectedTagVal == -1) { 1096 expectedTagVal = 0; 1097 } 1098 int line = t->getSrcLine(bp); 1099 int32_t rs = t->bi->getRuleStatus(); 1100 if (rs != expectedTagVal) { 1101 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1102 " Actual, Expected status = %4d, %4d", 1103 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1104 } 1105 } 1106 1107 prevBP = bp; 1108 } 1109 1110 // Verify that there were no missed breaks prior to the last one found 1111 for (i=prevBP-1; i>=0; i--) { 1112 if (t->getExpectedBreak(i) != 0) { 1113 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1114 i, t->getSrcLine(i), t->getSrcCol(i)); 1115 } 1116 } 1117 1118 // Check isBoundary() 1119 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1120 UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 1121 UBool boundaryFound = t->bi->isBoundary(i); 1122 if (boundaryExpected != boundaryFound) { 1123 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 1124 " Expected, Actual= %s, %s", 1125 i, t->getSrcLine(i), t->getSrcCol(i), 1126 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 1127 } 1128 } 1129 1130 // Check following() 1131 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1132 int32_t actualBreak = t->bi->following(i); 1133 int32_t expectedBreak = BreakIterator::DONE; 1134 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 1135 if (t->getExpectedBreak(j) != 0) { 1136 expectedBreak = j; 1137 break; 1138 } 1139 } 1140 if (expectedBreak != actualBreak) { 1141 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 1142 " Expected, Actual= %d, %d", 1143 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1144 } 1145 } 1146 1147 // Check preceding() 1148 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 1149 int32_t actualBreak = t->bi->preceding(i); 1150 int32_t expectedBreak = BreakIterator::DONE; 1151 1152 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 1153 // preceding(trailing byte) will return the index of some preceding code point, 1154 // not the lead byte of the current code point, even though that has a smaller index. 1155 // Therefore, start looking at the expected break data not at i-1, but at 1156 // the start of code point index - 1. 1157 utext_setNativeIndex(t->textToBreak, i); 1158 int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 1159 for (; j >= 0; j--) { 1160 if (t->getExpectedBreak(j) != 0) { 1161 expectedBreak = j; 1162 break; 1163 } 1164 } 1165 if (expectedBreak != actualBreak) { 1166 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 1167 " Expected, Actual= %d, %d", 1168 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1169 } 1170 } 1171 } 1172 1173 1174 void RBBITest::TestExtended() { 1175 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1176 UErrorCode status = U_ZERO_ERROR; 1177 Locale locale(""); 1178 1179 UnicodeString rules; 1180 TestParams tp(status); 1181 1182 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status); 1183 if (U_FAILURE(status)) { 1184 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1185 } 1186 1187 1188 // 1189 // Open and read the test data file. 1190 // 1191 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1192 char testFileName[1000]; 1193 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1194 errln("Can't open test data. Path too long."); 1195 return; 1196 } 1197 strcpy(testFileName, testDataDirectory); 1198 strcat(testFileName, "rbbitst.txt"); 1199 1200 int len; 1201 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1202 if (U_FAILURE(status)) { 1203 return; /* something went wrong, error already output */ 1204 } 1205 1206 1207 bool skipTest = false; // Skip this test? 1208 1209 // 1210 // Put the test data into a UnicodeString 1211 // 1212 UnicodeString testString(FALSE, testFile, len); 1213 1214 enum EParseState{ 1215 PARSE_COMMENT, 1216 PARSE_TAG, 1217 PARSE_DATA, 1218 PARSE_NUM 1219 } 1220 parseState = PARSE_TAG; 1221 1222 EParseState savedState = PARSE_TAG; 1223 1224 static const UChar CH_LF = 0x0a; 1225 static const UChar CH_CR = 0x0d; 1226 static const UChar CH_HASH = 0x23; 1227 /*static const UChar CH_PERIOD = 0x2e;*/ 1228 static const UChar CH_LT = 0x3c; 1229 static const UChar CH_GT = 0x3e; 1230 static const UChar CH_BACKSLASH = 0x5c; 1231 static const UChar CH_BULLET = 0x2022; 1232 1233 int32_t lineNum = 1; 1234 int32_t colStart = 0; 1235 int32_t column = 0; 1236 int32_t charIdx = 0; 1237 1238 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 1239 1240 for (charIdx = 0; charIdx < len; ) { 1241 status = U_ZERO_ERROR; 1242 UChar c = testString.charAt(charIdx); 1243 charIdx++; 1244 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1245 // treat CRLF as a unit 1246 c = CH_LF; 1247 charIdx++; 1248 } 1249 if (c == CH_LF || c == CH_CR) { 1250 lineNum++; 1251 colStart = charIdx; 1252 } 1253 column = charIdx - colStart + 1; 1254 1255 switch (parseState) { 1256 case PARSE_COMMENT: 1257 if (c == 0x0a || c == 0x0d) { 1258 parseState = savedState; 1259 } 1260 break; 1261 1262 case PARSE_TAG: 1263 { 1264 if (c == CH_HASH) { 1265 parseState = PARSE_COMMENT; 1266 savedState = PARSE_TAG; 1267 break; 1268 } 1269 if (u_isUWhiteSpace(c)) { 1270 break; 1271 } 1272 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1273 delete tp.bi; 1274 tp.bi = BreakIterator::createWordInstance(locale, status); 1275 skipTest = false; 1276 charIdx += 5; 1277 break; 1278 } 1279 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1280 delete tp.bi; 1281 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1282 skipTest = false; 1283 charIdx += 5; 1284 break; 1285 } 1286 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1287 delete tp.bi; 1288 tp.bi = BreakIterator::createLineInstance(locale, status); 1289 skipTest = false; 1290 charIdx += 5; 1291 break; 1292 } 1293 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1294 delete tp.bi; 1295 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1296 skipTest = false; 1297 charIdx += 5; 1298 break; 1299 } 1300 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1301 delete tp.bi; 1302 tp.bi = BreakIterator::createTitleInstance(locale, status); 1303 charIdx += 6; 1304 break; 1305 } 1306 1307 // <locale loc_name> 1308 localeMatcher.reset(testString); 1309 if (localeMatcher.lookingAt(charIdx-1, status)) { 1310 UnicodeString localeName = localeMatcher.group(1, status); 1311 char localeName8[100]; 1312 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1313 locale = Locale::createFromName(localeName8); 1314 charIdx += localeMatcher.group(0, status).length() - 1; 1315 TEST_ASSERT_SUCCESS(status); 1316 break; 1317 } 1318 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1319 parseState = PARSE_DATA; 1320 charIdx += 5; 1321 tp.dataToBreak = ""; 1322 tp.expectedBreaks->removeAllElements(); 1323 tp.srcCol ->removeAllElements(); 1324 tp.srcLine->removeAllElements(); 1325 break; 1326 } 1327 1328 errln("line %d: Tag expected in test file.", lineNum); 1329 parseState = PARSE_COMMENT; 1330 savedState = PARSE_DATA; 1331 goto end_test; // Stop the test. 1332 } 1333 break; 1334 1335 case PARSE_DATA: 1336 if (c == CH_BULLET) { 1337 int32_t breakIdx = tp.dataToBreak.length(); 1338 tp.expectedBreaks->setSize(breakIdx+1); 1339 tp.expectedBreaks->setElementAt(-1, breakIdx); 1340 tp.srcLine->setSize(breakIdx+1); 1341 tp.srcLine->setElementAt(lineNum, breakIdx); 1342 tp.srcCol ->setSize(breakIdx+1); 1343 tp.srcCol ->setElementAt(column, breakIdx); 1344 break; 1345 } 1346 1347 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1348 // Add final entry to mappings from break location to source file position. 1349 // Need one extra because last break position returned is after the 1350 // last char in the data, not at the last char. 1351 tp.srcLine->addElement(lineNum, status); 1352 tp.srcCol ->addElement(column, status); 1353 1354 parseState = PARSE_TAG; 1355 charIdx += 6; 1356 1357 if (!skipTest) { 1358 // RUN THE TEST! 1359 status = U_ZERO_ERROR; 1360 tp.setUTF16(status); 1361 executeTest(&tp, status); 1362 TEST_ASSERT_SUCCESS(status); 1363 1364 // Run again, this time with UTF-8 text wrapped in a UText. 1365 status = U_ZERO_ERROR; 1366 tp.setUTF8(status); 1367 TEST_ASSERT_SUCCESS(status); 1368 executeTest(&tp, status); 1369 } 1370 break; 1371 } 1372 1373 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1374 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1375 // Get the code point from the name and insert it into the test data. 1376 // (Damn, no API takes names in Unicode !!! 1377 // we've got to take it back to char *) 1378 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1379 int32_t nameLength = nameEndIdx - (charIdx+2); 1380 char charNameBuf[200]; 1381 UChar32 theChar = -1; 1382 if (nameEndIdx != -1) { 1383 UErrorCode status = U_ZERO_ERROR; 1384 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1385 charNameBuf[sizeof(charNameBuf)-1] = 0; 1386 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1387 if (U_FAILURE(status)) { 1388 theChar = -1; 1389 } 1390 } 1391 if (theChar == -1) { 1392 errln("Error in named character in test file at line %d, col %d", 1393 lineNum, column); 1394 } else { 1395 // Named code point was recognized. Insert it 1396 // into the test data. 1397 tp.dataToBreak.append(theChar); 1398 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1399 tp.srcLine->addElement(lineNum, status); 1400 tp.srcCol ->addElement(column, status); 1401 } 1402 } 1403 if (nameEndIdx > charIdx) { 1404 charIdx = nameEndIdx+1; 1405 1406 } 1407 break; 1408 } 1409 1410 1411 1412 1413 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1414 charIdx++; 1415 int32_t breakIdx = tp.dataToBreak.length(); 1416 tp.expectedBreaks->setSize(breakIdx+1); 1417 tp.expectedBreaks->setElementAt(-1, breakIdx); 1418 tp.srcLine->setSize(breakIdx+1); 1419 tp.srcLine->setElementAt(lineNum, breakIdx); 1420 tp.srcCol ->setSize(breakIdx+1); 1421 tp.srcCol ->setElementAt(column, breakIdx); 1422 break; 1423 } 1424 1425 if (c == CH_LT) { 1426 tagValue = 0; 1427 parseState = PARSE_NUM; 1428 break; 1429 } 1430 1431 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1432 parseState = PARSE_COMMENT; 1433 savedState = PARSE_DATA; 1434 break; 1435 } 1436 1437 if (c == CH_BACKSLASH) { 1438 // Check for \ at end of line, a line continuation. 1439 // Advance over (discard) the newline 1440 UChar32 cp = testString.char32At(charIdx); 1441 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1442 // We have a CR LF 1443 // Need an extra increment of the input ptr to move over both of them 1444 charIdx++; 1445 } 1446 if (cp == CH_LF || cp == CH_CR) { 1447 lineNum++; 1448 colStart = charIdx; 1449 charIdx++; 1450 break; 1451 } 1452 1453 // Let unescape handle the back slash. 1454 cp = testString.unescapeAt(charIdx); 1455 if (cp != -1) { 1456 // Escape sequence was recognized. Insert the char 1457 // into the test data. 1458 tp.dataToBreak.append(cp); 1459 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1460 tp.srcLine->addElement(lineNum, status); 1461 tp.srcCol ->addElement(column, status); 1462 } 1463 break; 1464 } 1465 1466 1467 // Not a recognized backslash escape sequence. 1468 // Take the next char as a literal. 1469 // TODO: Should this be an error? 1470 c = testString.charAt(charIdx); 1471 charIdx = testString.moveIndex32(charIdx, 1); 1472 } 1473 1474 // Normal, non-escaped data char. 1475 tp.dataToBreak.append(c); 1476 1477 // Save the mapping from offset in the data to line/column numbers in 1478 // the original input file. Will be used for better error messages only. 1479 // If there's an expected break before this char, the slot in the mapping 1480 // vector will already be set for this char; don't overwrite it. 1481 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1482 tp.srcLine->addElement(lineNum, status); 1483 tp.srcCol ->addElement(column, status); 1484 } 1485 break; 1486 1487 1488 case PARSE_NUM: 1489 // We are parsing an expected numeric tag value, like <1234>, 1490 // within a chunk of data. 1491 if (u_isUWhiteSpace(c)) { 1492 break; 1493 } 1494 1495 if (c == CH_GT) { 1496 // Finished the number. Add the info to the expected break data, 1497 // and switch parse state back to doing plain data. 1498 parseState = PARSE_DATA; 1499 if (tagValue == 0) { 1500 tagValue = -1; 1501 } 1502 int32_t breakIdx = tp.dataToBreak.length(); 1503 tp.expectedBreaks->setSize(breakIdx+1); 1504 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1505 tp.srcLine->setSize(breakIdx+1); 1506 tp.srcLine->setElementAt(lineNum, breakIdx); 1507 tp.srcCol ->setSize(breakIdx+1); 1508 tp.srcCol ->setElementAt(column, breakIdx); 1509 break; 1510 } 1511 1512 if (u_isdigit(c)) { 1513 tagValue = tagValue*10 + u_charDigitValue(c); 1514 break; 1515 } 1516 1517 errln("Syntax Error in test file at line %d, col %d", 1518 lineNum, column); 1519 parseState = PARSE_COMMENT; 1520 goto end_test; // Stop the test 1521 break; 1522 } 1523 1524 1525 if (U_FAILURE(status)) { 1526 dataerrln("ICU Error %s while parsing test file at line %d.", 1527 u_errorName(status), lineNum); 1528 status = U_ZERO_ERROR; 1529 goto end_test; // Stop the test 1530 } 1531 1532 } 1533 1534 end_test: 1535 delete [] testFile; 1536 #endif 1537 } 1538 1539 1540 //------------------------------------------------------------------------------- 1541 // 1542 // TestDictRules create a break iterator from source rules that includes a 1543 // dictionary range. Regression for bug #7130. Source rules 1544 // do not declare a break iterator type (word, line, sentence, etc. 1545 // but the dictionary code, without a type, would loop. 1546 // 1547 //------------------------------------------------------------------------------- 1548 void RBBITest::TestDictRules() { 1549 const char *rules = "$dictionary = [a-z]; \n" 1550 "!!forward; \n" 1551 "$dictionary $dictionary; \n" 1552 "!!reverse; \n" 1553 "$dictionary $dictionary; \n"; 1554 const char *text = "aa"; 1555 UErrorCode status = U_ZERO_ERROR; 1556 UParseError parseError; 1557 1558 RuleBasedBreakIterator bi(rules, parseError, status); 1559 if (U_SUCCESS(status)) { 1560 UnicodeString utext = text; 1561 bi.setText(utext); 1562 int32_t position; 1563 int32_t loops; 1564 for (loops = 0; loops<10; loops++) { 1565 position = bi.next(); 1566 if (position == RuleBasedBreakIterator::DONE) { 1567 break; 1568 } 1569 } 1570 TEST_ASSERT(loops == 1); 1571 } else { 1572 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1573 } 1574 } 1575 1576 1577 1578 //------------------------------------------------------------------------------- 1579 // 1580 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1581 // return the datain one big UChar * buffer, which the caller must delete. 1582 // 1583 // parameters: 1584 // fileName: the name of the file, with no directory part. The test data directory 1585 // is assumed. 1586 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1587 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1588 // specified here. The BOM, if it exists, will be stripped from the returned data. 1589 // Pass NULL for the system default encoding. 1590 // status 1591 // returns: 1592 // The file data, converted to UChar. 1593 // The caller must delete this when done with 1594 // delete [] theBuffer; 1595 // 1596 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1597 // Move this function to some common place. 1598 // 1599 //-------------------------------------------------------------------------------- 1600 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1601 UChar *retPtr = NULL; 1602 char *fileBuf = NULL; 1603 UConverter* conv = NULL; 1604 FILE *f = NULL; 1605 1606 ulen = 0; 1607 if (U_FAILURE(status)) { 1608 return retPtr; 1609 } 1610 1611 // 1612 // Open the file. 1613 // 1614 f = fopen(fileName, "rb"); 1615 if (f == 0) { 1616 dataerrln("Error opening test data file %s\n", fileName); 1617 status = U_FILE_ACCESS_ERROR; 1618 return NULL; 1619 } 1620 // 1621 // Read it in 1622 // 1623 int fileSize; 1624 int amt_read; 1625 1626 fseek( f, 0, SEEK_END); 1627 fileSize = ftell(f); 1628 fileBuf = new char[fileSize]; 1629 fseek(f, 0, SEEK_SET); 1630 amt_read = fread(fileBuf, 1, fileSize, f); 1631 if (amt_read != fileSize || fileSize <= 0) { 1632 errln("Error reading test data file."); 1633 goto cleanUpAndReturn; 1634 } 1635 1636 // 1637 // Look for a Unicode Signature (BOM) on the data just read 1638 // 1639 int32_t signatureLength; 1640 const char * fileBufC; 1641 const char* bomEncoding; 1642 1643 fileBufC = fileBuf; 1644 bomEncoding = ucnv_detectUnicodeSignature( 1645 fileBuf, fileSize, &signatureLength, &status); 1646 if(bomEncoding!=NULL ){ 1647 fileBufC += signatureLength; 1648 fileSize -= signatureLength; 1649 encoding = bomEncoding; 1650 } 1651 1652 // 1653 // Open a converter to take the rule file to UTF-16 1654 // 1655 conv = ucnv_open(encoding, &status); 1656 if (U_FAILURE(status)) { 1657 goto cleanUpAndReturn; 1658 } 1659 1660 // 1661 // Convert the rules to UChar. 1662 // Preflight first to determine required buffer size. 1663 // 1664 ulen = ucnv_toUChars(conv, 1665 NULL, // dest, 1666 0, // destCapacity, 1667 fileBufC, 1668 fileSize, 1669 &status); 1670 if (status == U_BUFFER_OVERFLOW_ERROR) { 1671 // Buffer Overflow is expected from the preflight operation. 1672 status = U_ZERO_ERROR; 1673 1674 retPtr = new UChar[ulen+1]; 1675 ucnv_toUChars(conv, 1676 retPtr, // dest, 1677 ulen+1, 1678 fileBufC, 1679 fileSize, 1680 &status); 1681 } 1682 1683 cleanUpAndReturn: 1684 fclose(f); 1685 delete []fileBuf; 1686 ucnv_close(conv); 1687 if (U_FAILURE(status)) { 1688 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1689 delete []retPtr; 1690 retPtr = 0; 1691 ulen = 0; 1692 }; 1693 return retPtr; 1694 } 1695 1696 1697 1698 //-------------------------------------------------------------------------------------------- 1699 // 1700 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1701 // 1702 //------------------------------------------------------------------------------------------- 1703 void RBBITest::TestUnicodeFiles() { 1704 RuleBasedBreakIterator *bi; 1705 UErrorCode status = U_ZERO_ERROR; 1706 1707 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1708 TEST_ASSERT_SUCCESS(status); 1709 if (U_SUCCESS(status)) { 1710 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1711 } 1712 delete bi; 1713 1714 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1715 TEST_ASSERT_SUCCESS(status); 1716 if (U_SUCCESS(status)) { 1717 runUnicodeTestData("WordBreakTest.txt", bi); 1718 } 1719 delete bi; 1720 1721 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1722 TEST_ASSERT_SUCCESS(status); 1723 if (U_SUCCESS(status)) { 1724 runUnicodeTestData("SentenceBreakTest.txt", bi); 1725 } 1726 delete bi; 1727 1728 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1729 TEST_ASSERT_SUCCESS(status); 1730 if (U_SUCCESS(status)) { 1731 runUnicodeTestData("LineBreakTest.txt", bi); 1732 } 1733 delete bi; 1734 } 1735 1736 1737 // Check for test cases from the Unicode test data files that are known to fail 1738 // and should be skipped because ICU is not yet able to fully implement the spec. 1739 // See ticket #7270. 1740 1741 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 1742 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file. 1743 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198 1744 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202 1745 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214 1746 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246 1747 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298 1748 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302 1749 }; 1750 if (strcmp(fileName, "LineBreakTest.txt") != 0) { 1751 return FALSE; 1752 } 1753 1754 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) { 1755 if (testCase == UnicodeString(badTestCases[i])) { 1756 return logKnownIssue("7270"); 1757 } 1758 } 1759 return FALSE; 1760 } 1761 1762 1763 //-------------------------------------------------------------------------------------------- 1764 // 1765 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1766 // 1767 //------------------------------------------------------------------------------------------- 1768 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1769 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1770 UErrorCode status = U_ZERO_ERROR; 1771 1772 // 1773 // Open and read the test data file, put it into a UnicodeString. 1774 // 1775 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1776 char testFileName[1000]; 1777 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1778 dataerrln("Can't open test data. Path too long."); 1779 return; 1780 } 1781 strcpy(testFileName, testDataDirectory); 1782 strcat(testFileName, fileName); 1783 1784 logln("Opening data file %s\n", fileName); 1785 1786 int len; 1787 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1788 if (status != U_FILE_ACCESS_ERROR) { 1789 TEST_ASSERT_SUCCESS(status); 1790 TEST_ASSERT(testFile != NULL); 1791 } 1792 if (U_FAILURE(status) || testFile == NULL) { 1793 return; /* something went wrong, error already output */ 1794 } 1795 UnicodeString testFileAsString(TRUE, testFile, len); 1796 1797 // 1798 // Parse the test data file using a regular expression. 1799 // Each kind of token is recognized in its own capture group; what type of item was scanned 1800 // is identified by which group had a match. 1801 // 1802 // Caputure Group # 1 2 3 4 5 1803 // Parses this item: divide x hex digits comment \n unrecognized \n 1804 // 1805 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1806 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1807 UnicodeString testString; 1808 UVector32 breakPositions(status); 1809 int lineNumber = 1; 1810 TEST_ASSERT_SUCCESS(status); 1811 if (U_FAILURE(status)) { 1812 return; 1813 } 1814 1815 // 1816 // Scan through each test case, building up the string to be broken in testString, 1817 // and the positions that should be boundaries in the breakPositions vector. 1818 // 1819 int spin = 0; 1820 while (tokenMatcher.find()) { 1821 if(tokenMatcher.hitEnd()) { 1822 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1823 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1824 and caused an infinite loop here on EBCDIC systems! 1825 */ 1826 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1827 // return; 1828 } 1829 if (tokenMatcher.start(1, status) >= 0) { 1830 // Scanned a divide sign, indicating a break position in the test data. 1831 if (testString.length()>0) { 1832 breakPositions.addElement(testString.length(), status); 1833 } 1834 } 1835 else if (tokenMatcher.start(2, status) >= 0) { 1836 // Scanned an 'x', meaning no break at this position in the test data 1837 // Nothing to be done here. 1838 } 1839 else if (tokenMatcher.start(3, status) >= 0) { 1840 // Scanned Hex digits. Convert them to binary, append to the character data string. 1841 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1842 int length = hexNumber.length(); 1843 if (length<=8) { 1844 char buf[10]; 1845 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1846 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1847 if (c<=0x10ffff) { 1848 testString.append(c); 1849 } else { 1850 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1851 fileName, lineNumber); 1852 } 1853 } else { 1854 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1855 fileName, lineNumber); 1856 } 1857 } 1858 else if (tokenMatcher.start(4, status) >= 0) { 1859 // Scanned to end of a line, possibly skipping over a comment in the process. 1860 // If the line from the file contained test data, run the test now. 1861 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1862 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1863 } 1864 1865 // Clear out this test case. 1866 // The string and breakPositions vector will be refilled as the next 1867 // test case is parsed. 1868 testString.remove(); 1869 breakPositions.removeAllElements(); 1870 lineNumber++; 1871 } else { 1872 // Scanner catchall. Something unrecognized appeared on the line. 1873 char token[16]; 1874 UnicodeString uToken = tokenMatcher.group(0, status); 1875 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1876 token[sizeof(token)-1] = 0; 1877 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1878 1879 // Clean up, in preparation for continuing with the next line. 1880 testString.remove(); 1881 breakPositions.removeAllElements(); 1882 lineNumber++; 1883 } 1884 TEST_ASSERT_SUCCESS(status); 1885 if (U_FAILURE(status)) { 1886 break; 1887 } 1888 } 1889 1890 delete [] testFile; 1891 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1892 } 1893 1894 //-------------------------------------------------------------------------------------------- 1895 // 1896 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1897 // test data files. Do only a simple, forward-only check - 1898 // this test is mostly to check that ICU and the Unicode 1899 // data agree with each other. 1900 // 1901 //-------------------------------------------------------------------------------------------- 1902 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1903 const UnicodeString &testString, // Text data to be broken 1904 UVector32 *breakPositions, // Positions where breaks should be found. 1905 RuleBasedBreakIterator *bi) { 1906 int32_t pos; // Break Position in the test string 1907 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 1908 int32_t expectedPos; // Expected break position (index into test string) 1909 1910 bi->setText(testString); 1911 pos = bi->first(); 1912 pos = bi->next(); 1913 1914 while (pos != BreakIterator::DONE) { 1915 if (expectedI >= breakPositions->size()) { 1916 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1917 testFileName, lineNumber, pos); 1918 break; 1919 } 1920 expectedPos = breakPositions->elementAti(expectedI); 1921 if (pos < expectedPos) { 1922 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1923 testFileName, lineNumber, pos); 1924 break; 1925 } 1926 if (pos > expectedPos) { 1927 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1928 testFileName, lineNumber, expectedPos); 1929 break; 1930 } 1931 pos = bi->next(); 1932 expectedI++; 1933 } 1934 1935 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1936 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1937 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1938 } 1939 } 1940 1941 1942 1943 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1944 //--------------------------------------------------------------------------------------- 1945 // 1946 // classs RBBIMonkeyKind 1947 // 1948 // Monkey Test for Break Iteration 1949 // Abstract interface class. Concrete derived classes independently 1950 // implement the break rules for different iterator types. 1951 // 1952 // The Monkey Test itself uses doesn't know which type of break iterator it is 1953 // testing, but works purely in terms of the interface defined here. 1954 // 1955 //--------------------------------------------------------------------------------------- 1956 class RBBIMonkeyKind { 1957 public: 1958 // Return a UVector of UnicodeSets, representing the character classes used 1959 // for this type of iterator. 1960 virtual UVector *charClasses() = 0; 1961 1962 // Set the test text on which subsequent calls to next() will operate 1963 virtual void setText(const UnicodeString &s) = 0; 1964 1965 // Find the next break postion, starting from the prev break position, or from zero. 1966 // Return -1 after reaching end of string. 1967 virtual int32_t next(int32_t i) = 0; 1968 1969 virtual ~RBBIMonkeyKind(); 1970 UErrorCode deferredStatus; 1971 1972 1973 protected: 1974 RBBIMonkeyKind(); 1975 1976 private: 1977 }; 1978 1979 RBBIMonkeyKind::RBBIMonkeyKind() { 1980 deferredStatus = U_ZERO_ERROR; 1981 } 1982 1983 RBBIMonkeyKind::~RBBIMonkeyKind() { 1984 } 1985 1986 1987 //---------------------------------------------------------------------------------------- 1988 // 1989 // Random Numbers. Similar to standard lib rand() and srand() 1990 // Not using library to 1991 // 1. Get same results on all platforms. 1992 // 2. Get access to current seed, to more easily reproduce failures. 1993 // 1994 //--------------------------------------------------------------------------------------- 1995 static uint32_t m_seed = 1; 1996 1997 static uint32_t m_rand() 1998 { 1999 m_seed = m_seed * 1103515245 + 12345; 2000 return (uint32_t)(m_seed/65536) % 32768; 2001 } 2002 2003 2004 //------------------------------------------------------------------------------------------ 2005 // 2006 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2007 // of RBBIMonkeyKind. 2008 // 2009 //------------------------------------------------------------------------------------------ 2010 class RBBICharMonkey: public RBBIMonkeyKind { 2011 public: 2012 RBBICharMonkey(); 2013 virtual ~RBBICharMonkey(); 2014 virtual UVector *charClasses(); 2015 virtual void setText(const UnicodeString &s); 2016 virtual int32_t next(int32_t i); 2017 private: 2018 UVector *fSets; 2019 2020 UnicodeSet *fCRLFSet; 2021 UnicodeSet *fControlSet; 2022 UnicodeSet *fExtendSet; 2023 UnicodeSet *fRegionalIndicatorSet; 2024 UnicodeSet *fPrependSet; 2025 UnicodeSet *fSpacingSet; 2026 UnicodeSet *fLSet; 2027 UnicodeSet *fVSet; 2028 UnicodeSet *fTSet; 2029 UnicodeSet *fLVSet; 2030 UnicodeSet *fLVTSet; 2031 UnicodeSet *fHangulSet; 2032 UnicodeSet *fAnySet; 2033 2034 const UnicodeString *fText; 2035 }; 2036 2037 2038 RBBICharMonkey::RBBICharMonkey() { 2039 UErrorCode status = U_ZERO_ERROR; 2040 2041 fText = NULL; 2042 2043 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2044 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2045 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2046 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 2047 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2048 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2049 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2050 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2051 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2052 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2053 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2054 fHangulSet = new UnicodeSet(); 2055 fHangulSet->addAll(*fLSet); 2056 fHangulSet->addAll(*fVSet); 2057 fHangulSet->addAll(*fTSet); 2058 fHangulSet->addAll(*fLVSet); 2059 fHangulSet->addAll(*fLVTSet); 2060 fAnySet = new UnicodeSet(0, 0x10ffff); 2061 2062 fSets = new UVector(status); 2063 fSets->addElement(fCRLFSet, status); 2064 fSets->addElement(fControlSet, status); 2065 fSets->addElement(fExtendSet, status); 2066 fSets->addElement(fRegionalIndicatorSet, status); 2067 if (!fPrependSet->isEmpty()) { 2068 fSets->addElement(fPrependSet, status); 2069 } 2070 fSets->addElement(fSpacingSet, status); 2071 fSets->addElement(fHangulSet, status); 2072 fSets->addElement(fAnySet, status); 2073 if (U_FAILURE(status)) { 2074 deferredStatus = status; 2075 } 2076 } 2077 2078 2079 void RBBICharMonkey::setText(const UnicodeString &s) { 2080 fText = &s; 2081 } 2082 2083 2084 2085 int32_t RBBICharMonkey::next(int32_t prevPos) { 2086 int p0, p1, p2, p3; // Indices of the significant code points around the 2087 // break position being tested. The candidate break 2088 // location is before p2. 2089 2090 int breakPos = -1; 2091 2092 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2093 2094 if (U_FAILURE(deferredStatus)) { 2095 return -1; 2096 } 2097 2098 // Previous break at end of string. return DONE. 2099 if (prevPos >= fText->length()) { 2100 return -1; 2101 } 2102 p0 = p1 = p2 = p3 = prevPos; 2103 c3 = fText->char32At(prevPos); 2104 c0 = c1 = c2 = 0; 2105 (void)p0; // suppress set but not used warning. 2106 (void)c0; 2107 2108 // Loop runs once per "significant" character position in the input text. 2109 for (;;) { 2110 // Move all of the positions forward in the input string. 2111 p0 = p1; c0 = c1; 2112 p1 = p2; c1 = c2; 2113 p2 = p3; c2 = c3; 2114 2115 // Advancd p3 by one codepoint 2116 p3 = fText->moveIndex32(p3, 1); 2117 c3 = fText->char32At(p3); 2118 2119 if (p1 == p2) { 2120 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2121 continue; 2122 } 2123 if (p2 == fText->length()) { 2124 // Reached end of string. Always a break position. 2125 break; 2126 } 2127 2128 // Rule GB3 CR x LF 2129 // No Extend or Format characters may appear between the CR and LF, 2130 // which requires the additional check for p2 immediately following p1. 2131 // 2132 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2133 continue; 2134 } 2135 2136 // Rule (GB4). ( Control | CR | LF ) <break> 2137 if (fControlSet->contains(c1) || 2138 c1 == 0x0D || 2139 c1 == 0x0A) { 2140 break; 2141 } 2142 2143 // Rule (GB5) <break> ( Control | CR | LF ) 2144 // 2145 if (fControlSet->contains(c2) || 2146 c2 == 0x0D || 2147 c2 == 0x0A) { 2148 break; 2149 } 2150 2151 2152 // Rule (GB6) L x ( L | V | LV | LVT ) 2153 if (fLSet->contains(c1) && 2154 (fLSet->contains(c2) || 2155 fVSet->contains(c2) || 2156 fLVSet->contains(c2) || 2157 fLVTSet->contains(c2))) { 2158 continue; 2159 } 2160 2161 // Rule (GB7) ( LV | V ) x ( V | T ) 2162 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2163 (fVSet->contains(c2) || fTSet->contains(c2))) { 2164 continue; 2165 } 2166 2167 // Rule (GB8) ( LVT | T) x T 2168 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2169 fTSet->contains(c2)) { 2170 continue; 2171 } 2172 2173 // Rule (GB8a) Regional_Indicator x Regional_Indicator 2174 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2175 continue; 2176 } 2177 2178 // Rule (GB9) Numeric x ALetter 2179 if (fExtendSet->contains(c2)) { 2180 continue; 2181 } 2182 2183 // Rule (GB9a) x SpacingMark 2184 if (fSpacingSet->contains(c2)) { 2185 continue; 2186 } 2187 2188 // Rule (GB9b) Prepend x 2189 if (fPrependSet->contains(c1)) { 2190 continue; 2191 } 2192 2193 // Rule (GB10) Any <break> Any 2194 break; 2195 } 2196 2197 breakPos = p2; 2198 return breakPos; 2199 } 2200 2201 2202 2203 UVector *RBBICharMonkey::charClasses() { 2204 return fSets; 2205 } 2206 2207 2208 RBBICharMonkey::~RBBICharMonkey() { 2209 delete fSets; 2210 delete fCRLFSet; 2211 delete fControlSet; 2212 delete fExtendSet; 2213 delete fRegionalIndicatorSet; 2214 delete fPrependSet; 2215 delete fSpacingSet; 2216 delete fLSet; 2217 delete fVSet; 2218 delete fTSet; 2219 delete fLVSet; 2220 delete fLVTSet; 2221 delete fHangulSet; 2222 delete fAnySet; 2223 } 2224 2225 //------------------------------------------------------------------------------------------ 2226 // 2227 // class RBBIWordMonkey Word Break specific implementation 2228 // of RBBIMonkeyKind. 2229 // 2230 //------------------------------------------------------------------------------------------ 2231 class RBBIWordMonkey: public RBBIMonkeyKind { 2232 public: 2233 RBBIWordMonkey(); 2234 virtual ~RBBIWordMonkey(); 2235 virtual UVector *charClasses(); 2236 virtual void setText(const UnicodeString &s); 2237 virtual int32_t next(int32_t i); 2238 private: 2239 UVector *fSets; 2240 2241 UnicodeSet *fCRSet; 2242 UnicodeSet *fLFSet; 2243 UnicodeSet *fNewlineSet; 2244 UnicodeSet *fRegionalIndicatorSet; 2245 UnicodeSet *fKatakanaSet; 2246 UnicodeSet *fHebrew_LetterSet; 2247 UnicodeSet *fALetterSet; 2248 // TODO(jungshik): Do we still need this change? 2249 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 2250 UnicodeSet *fSingle_QuoteSet; 2251 UnicodeSet *fDouble_QuoteSet; 2252 UnicodeSet *fMidNumLetSet; 2253 UnicodeSet *fMidLetterSet; 2254 UnicodeSet *fMidNumSet; 2255 UnicodeSet *fNumericSet; 2256 UnicodeSet *fFormatSet; 2257 UnicodeSet *fOtherSet; 2258 UnicodeSet *fExtendSet; 2259 UnicodeSet *fExtendNumLetSet; 2260 UnicodeSet *fDictionaryCjkSet; 2261 2262 const UnicodeString *fText; 2263 }; 2264 2265 2266 RBBIWordMonkey::RBBIWordMonkey() 2267 { 2268 UErrorCode status = U_ZERO_ERROR; 2269 2270 fSets = new UVector(status); 2271 2272 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2273 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2274 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2275 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); 2276 // Exclude Hangul syllables from ALetterSet during testing. 2277 // Leave CJK dictionary characters out from the monkey tests! 2278 #if 0 2279 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2280 "[\\p{Line_Break = Complex_Context}" 2281 "-\\p{Grapheme_Cluster_Break = Extend}" 2282 "-\\p{Grapheme_Cluster_Break = Control}" 2283 "]]", 2284 status); 2285 #endif 2286 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2287 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2288 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 2289 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2290 fALetterSet->removeAll(*fDictionaryCjkSet); 2291 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 2292 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 2293 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2294 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2295 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2296 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 2297 // we should figure out why 2298 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2299 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2300 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2301 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2302 2303 fOtherSet = new UnicodeSet(); 2304 if(U_FAILURE(status)) { 2305 deferredStatus = status; 2306 return; 2307 } 2308 2309 fOtherSet->complement(); 2310 fOtherSet->removeAll(*fCRSet); 2311 fOtherSet->removeAll(*fLFSet); 2312 fOtherSet->removeAll(*fNewlineSet); 2313 fOtherSet->removeAll(*fKatakanaSet); 2314 fOtherSet->removeAll(*fHebrew_LetterSet); 2315 fOtherSet->removeAll(*fALetterSet); 2316 fOtherSet->removeAll(*fSingle_QuoteSet); 2317 fOtherSet->removeAll(*fDouble_QuoteSet); 2318 fOtherSet->removeAll(*fMidLetterSet); 2319 fOtherSet->removeAll(*fMidNumSet); 2320 fOtherSet->removeAll(*fNumericSet); 2321 fOtherSet->removeAll(*fExtendNumLetSet); 2322 fOtherSet->removeAll(*fFormatSet); 2323 fOtherSet->removeAll(*fExtendSet); 2324 fOtherSet->removeAll(*fRegionalIndicatorSet); 2325 // Inhibit dictionary characters from being tested at all. 2326 fOtherSet->removeAll(*fDictionaryCjkSet); 2327 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2328 2329 fSets->addElement(fCRSet, status); 2330 fSets->addElement(fLFSet, status); 2331 fSets->addElement(fNewlineSet, status); 2332 fSets->addElement(fRegionalIndicatorSet, status); 2333 fSets->addElement(fHebrew_LetterSet, status); 2334 fSets->addElement(fALetterSet, status); 2335 fSets->addElement(fSingle_QuoteSet, status); 2336 fSets->addElement(fDouble_QuoteSet, status); 2337 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 2338 fSets->addElement(fMidLetterSet, status); 2339 fSets->addElement(fMidNumLetSet, status); 2340 fSets->addElement(fMidNumSet, status); 2341 fSets->addElement(fNumericSet, status); 2342 fSets->addElement(fFormatSet, status); 2343 fSets->addElement(fExtendSet, status); 2344 fSets->addElement(fOtherSet, status); 2345 fSets->addElement(fExtendNumLetSet, status); 2346 2347 if (U_FAILURE(status)) { 2348 deferredStatus = status; 2349 } 2350 } 2351 2352 void RBBIWordMonkey::setText(const UnicodeString &s) { 2353 fText = &s; 2354 } 2355 2356 2357 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2358 int p0, p1, p2, p3; // Indices of the significant code points around the 2359 // break position being tested. The candidate break 2360 // location is before p2. 2361 2362 int breakPos = -1; 2363 2364 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2365 2366 if (U_FAILURE(deferredStatus)) { 2367 return -1; 2368 } 2369 2370 // Prev break at end of string. return DONE. 2371 if (prevPos >= fText->length()) { 2372 return -1; 2373 } 2374 p0 = p1 = p2 = p3 = prevPos; 2375 c3 = fText->char32At(prevPos); 2376 c0 = c1 = c2 = 0; 2377 (void)p0; // Suppress set but not used warning. 2378 2379 // Loop runs once per "significant" character position in the input text. 2380 for (;;) { 2381 // Move all of the positions forward in the input string. 2382 p0 = p1; c0 = c1; 2383 p1 = p2; c1 = c2; 2384 p2 = p3; c2 = c3; 2385 2386 // Advancd p3 by X(Extend | Format)* Rule 4 2387 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2388 do { 2389 p3 = fText->moveIndex32(p3, 1); 2390 c3 = fText->char32At(p3); 2391 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2392 break; 2393 }; 2394 } 2395 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2396 2397 2398 if (p1 == p2) { 2399 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2400 continue; 2401 } 2402 if (p2 == fText->length()) { 2403 // Reached end of string. Always a break position. 2404 break; 2405 } 2406 2407 // Rule (3) CR x LF 2408 // No Extend or Format characters may appear between the CR and LF, 2409 // which requires the additional check for p2 immediately following p1. 2410 // 2411 if (c1==0x0D && c2==0x0A) { 2412 continue; 2413 } 2414 2415 // Rule (3a) Break before and after newlines (including CR and LF) 2416 // 2417 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2418 break; 2419 }; 2420 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2421 break; 2422 }; 2423 2424 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2425 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2426 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2427 continue; 2428 } 2429 2430 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2431 // 2432 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2433 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2434 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2435 continue; 2436 } 2437 2438 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2439 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2440 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2441 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2442 continue; 2443 } 2444 2445 // Rule (7a) Hebrew_Letter x Single_Quote 2446 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2447 continue; 2448 } 2449 2450 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2451 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2452 continue; 2453 } 2454 2455 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2456 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2457 continue; 2458 } 2459 2460 // Rule (8) Numeric x Numeric 2461 if (fNumericSet->contains(c1) && 2462 fNumericSet->contains(c2)) { 2463 continue; 2464 } 2465 2466 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2467 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2468 fNumericSet->contains(c2)) { 2469 continue; 2470 } 2471 2472 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2473 if (fNumericSet->contains(c1) && 2474 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2475 continue; 2476 } 2477 2478 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2479 if (fNumericSet->contains(c0) && 2480 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2481 fNumericSet->contains(c2)) { 2482 continue; 2483 } 2484 2485 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2486 if (fNumericSet->contains(c1) && 2487 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2488 fNumericSet->contains(c3)) { 2489 continue; 2490 } 2491 2492 // Rule (13) Katakana x Katakana 2493 if (fKatakanaSet->contains(c1) && 2494 fKatakanaSet->contains(c2)) { 2495 continue; 2496 } 2497 2498 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2499 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2500 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2501 fExtendNumLetSet->contains(c2)) { 2502 continue; 2503 } 2504 2505 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2506 if (fExtendNumLetSet->contains(c1) && 2507 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2508 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2509 continue; 2510 } 2511 2512 // Rule 13c 2513 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2514 continue; 2515 } 2516 2517 // Rule 14. Break found here. 2518 break; 2519 } 2520 2521 breakPos = p2; 2522 return breakPos; 2523 } 2524 2525 2526 UVector *RBBIWordMonkey::charClasses() { 2527 return fSets; 2528 } 2529 2530 2531 RBBIWordMonkey::~RBBIWordMonkey() { 2532 delete fSets; 2533 delete fCRSet; 2534 delete fLFSet; 2535 delete fNewlineSet; 2536 delete fKatakanaSet; 2537 delete fHebrew_LetterSet; 2538 delete fALetterSet; 2539 delete fSingle_QuoteSet; 2540 delete fDouble_QuoteSet; 2541 delete fMidNumLetSet; 2542 delete fMidLetterSet; 2543 delete fMidNumSet; 2544 delete fNumericSet; 2545 delete fFormatSet; 2546 delete fExtendSet; 2547 delete fExtendNumLetSet; 2548 delete fRegionalIndicatorSet; 2549 delete fDictionaryCjkSet; 2550 delete fOtherSet; 2551 } 2552 2553 2554 2555 2556 //------------------------------------------------------------------------------------------ 2557 // 2558 // class RBBISentMonkey Sentence Break specific implementation 2559 // of RBBIMonkeyKind. 2560 // 2561 //------------------------------------------------------------------------------------------ 2562 class RBBISentMonkey: public RBBIMonkeyKind { 2563 public: 2564 RBBISentMonkey(); 2565 virtual ~RBBISentMonkey(); 2566 virtual UVector *charClasses(); 2567 virtual void setText(const UnicodeString &s); 2568 virtual int32_t next(int32_t i); 2569 private: 2570 int moveBack(int posFrom); 2571 int moveForward(int posFrom); 2572 UChar32 cAt(int pos); 2573 2574 UVector *fSets; 2575 2576 UnicodeSet *fSepSet; 2577 UnicodeSet *fFormatSet; 2578 UnicodeSet *fSpSet; 2579 UnicodeSet *fLowerSet; 2580 UnicodeSet *fUpperSet; 2581 UnicodeSet *fOLetterSet; 2582 UnicodeSet *fNumericSet; 2583 UnicodeSet *fATermSet; 2584 UnicodeSet *fSContinueSet; 2585 UnicodeSet *fSTermSet; 2586 UnicodeSet *fCloseSet; 2587 UnicodeSet *fOtherSet; 2588 UnicodeSet *fExtendSet; 2589 2590 const UnicodeString *fText; 2591 2592 }; 2593 2594 RBBISentMonkey::RBBISentMonkey() 2595 { 2596 UErrorCode status = U_ZERO_ERROR; 2597 2598 fSets = new UVector(status); 2599 2600 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2601 // set and made into character classes of their own. For the monkey impl, 2602 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2603 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2604 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2605 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2606 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2607 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2608 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2609 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2610 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2611 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2612 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2613 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2614 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2615 fOtherSet = new UnicodeSet(); 2616 2617 if(U_FAILURE(status)) { 2618 deferredStatus = status; 2619 return; 2620 } 2621 2622 fOtherSet->complement(); 2623 fOtherSet->removeAll(*fSepSet); 2624 fOtherSet->removeAll(*fFormatSet); 2625 fOtherSet->removeAll(*fSpSet); 2626 fOtherSet->removeAll(*fLowerSet); 2627 fOtherSet->removeAll(*fUpperSet); 2628 fOtherSet->removeAll(*fOLetterSet); 2629 fOtherSet->removeAll(*fNumericSet); 2630 fOtherSet->removeAll(*fATermSet); 2631 fOtherSet->removeAll(*fSContinueSet); 2632 fOtherSet->removeAll(*fSTermSet); 2633 fOtherSet->removeAll(*fCloseSet); 2634 fOtherSet->removeAll(*fExtendSet); 2635 2636 fSets->addElement(fSepSet, status); 2637 fSets->addElement(fFormatSet, status); 2638 fSets->addElement(fSpSet, status); 2639 fSets->addElement(fLowerSet, status); 2640 fSets->addElement(fUpperSet, status); 2641 fSets->addElement(fOLetterSet, status); 2642 fSets->addElement(fNumericSet, status); 2643 fSets->addElement(fATermSet, status); 2644 fSets->addElement(fSContinueSet, status); 2645 fSets->addElement(fSTermSet, status); 2646 fSets->addElement(fCloseSet, status); 2647 fSets->addElement(fOtherSet, status); 2648 fSets->addElement(fExtendSet, status); 2649 2650 if (U_FAILURE(status)) { 2651 deferredStatus = status; 2652 } 2653 } 2654 2655 2656 2657 void RBBISentMonkey::setText(const UnicodeString &s) { 2658 fText = &s; 2659 } 2660 2661 UVector *RBBISentMonkey::charClasses() { 2662 return fSets; 2663 } 2664 2665 2666 // moveBack() Find the "significant" code point preceding the index i. 2667 // Skips over ($Extend | $Format)* . 2668 // 2669 int RBBISentMonkey::moveBack(int i) { 2670 if (i <= 0) { 2671 return -1; 2672 } 2673 UChar32 c; 2674 int32_t j = i; 2675 do { 2676 j = fText->moveIndex32(j, -1); 2677 c = fText->char32At(j); 2678 } 2679 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2680 return j; 2681 2682 } 2683 2684 2685 int RBBISentMonkey::moveForward(int i) { 2686 if (i>=fText->length()) { 2687 return fText->length(); 2688 } 2689 UChar32 c; 2690 int32_t j = i; 2691 do { 2692 j = fText->moveIndex32(j, 1); 2693 c = cAt(j); 2694 } 2695 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2696 return j; 2697 } 2698 2699 UChar32 RBBISentMonkey::cAt(int pos) { 2700 if (pos<0 || pos>=fText->length()) { 2701 return -1; 2702 } else { 2703 return fText->char32At(pos); 2704 } 2705 } 2706 2707 int32_t RBBISentMonkey::next(int32_t prevPos) { 2708 int p0, p1, p2, p3; // Indices of the significant code points around the 2709 // break position being tested. The candidate break 2710 // location is before p2. 2711 2712 int breakPos = -1; 2713 2714 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2715 UChar32 c; 2716 2717 if (U_FAILURE(deferredStatus)) { 2718 return -1; 2719 } 2720 2721 // Prev break at end of string. return DONE. 2722 if (prevPos >= fText->length()) { 2723 return -1; 2724 } 2725 p0 = p1 = p2 = p3 = prevPos; 2726 c3 = fText->char32At(prevPos); 2727 c0 = c1 = c2 = 0; 2728 (void)p0; // Suppress set but not used warning. 2729 2730 // Loop runs once per "significant" character position in the input text. 2731 for (;;) { 2732 // Move all of the positions forward in the input string. 2733 p0 = p1; c0 = c1; 2734 p1 = p2; c1 = c2; 2735 p2 = p3; c2 = c3; 2736 2737 // Advancd p3 by X(Extend | Format)* Rule 4 2738 p3 = moveForward(p3); 2739 c3 = cAt(p3); 2740 2741 // Rule (3) CR x LF 2742 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2743 continue; 2744 } 2745 2746 // Rule (4). Sep <break> 2747 if (fSepSet->contains(c1)) { 2748 p2 = p1+1; // Separators don't combine with Extend or Format. 2749 break; 2750 } 2751 2752 if (p2 >= fText->length()) { 2753 // Reached end of string. Always a break position. 2754 break; 2755 } 2756 2757 if (p2 == prevPos) { 2758 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2759 continue; 2760 } 2761 2762 // Rule (6). ATerm x Numeric 2763 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2764 continue; 2765 } 2766 2767 // Rule (7). (Upper | Lower) ATerm x Uppper 2768 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && 2769 fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2770 continue; 2771 } 2772 2773 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2774 // Note: STerm | ATerm are added to the negated part of the expression by a 2775 // note to the Unicode 5.0 documents. 2776 int p8 = p1; 2777 while (fSpSet->contains(cAt(p8))) { 2778 p8 = moveBack(p8); 2779 } 2780 while (fCloseSet->contains(cAt(p8))) { 2781 p8 = moveBack(p8); 2782 } 2783 if (fATermSet->contains(cAt(p8))) { 2784 p8=p2; 2785 for (;;) { 2786 c = cAt(p8); 2787 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2788 fLowerSet->contains(c) || fSepSet->contains(c) || 2789 fATermSet->contains(c) || fSTermSet->contains(c)) { 2790 break; 2791 } 2792 p8 = moveForward(p8); 2793 } 2794 if (fLowerSet->contains(cAt(p8))) { 2795 continue; 2796 } 2797 } 2798 2799 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2800 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2801 p8 = p1; 2802 while (fSpSet->contains(cAt(p8))) { 2803 p8 = moveBack(p8); 2804 } 2805 while (fCloseSet->contains(cAt(p8))) { 2806 p8 = moveBack(p8); 2807 } 2808 c = cAt(p8); 2809 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2810 continue; 2811 } 2812 } 2813 2814 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2815 int p9 = p1; 2816 while (fCloseSet->contains(cAt(p9))) { 2817 p9 = moveBack(p9); 2818 } 2819 c = cAt(p9); 2820 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2821 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2822 continue; 2823 } 2824 } 2825 2826 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2827 int p10 = p1; 2828 while (fSpSet->contains(cAt(p10))) { 2829 p10 = moveBack(p10); 2830 } 2831 while (fCloseSet->contains(cAt(p10))) { 2832 p10 = moveBack(p10); 2833 } 2834 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2835 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2836 continue; 2837 } 2838 } 2839 2840 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break> 2841 int p11 = p1; 2842 if (fSepSet->contains(cAt(p11))) { 2843 p11 = moveBack(p11); 2844 } 2845 while (fSpSet->contains(cAt(p11))) { 2846 p11 = moveBack(p11); 2847 } 2848 while (fCloseSet->contains(cAt(p11))) { 2849 p11 = moveBack(p11); 2850 } 2851 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2852 break; 2853 } 2854 2855 // Rule (12) Any x Any 2856 continue; 2857 } 2858 breakPos = p2; 2859 return breakPos; 2860 } 2861 2862 RBBISentMonkey::~RBBISentMonkey() { 2863 delete fSets; 2864 delete fSepSet; 2865 delete fFormatSet; 2866 delete fSpSet; 2867 delete fLowerSet; 2868 delete fUpperSet; 2869 delete fOLetterSet; 2870 delete fNumericSet; 2871 delete fATermSet; 2872 delete fSContinueSet; 2873 delete fSTermSet; 2874 delete fCloseSet; 2875 delete fOtherSet; 2876 delete fExtendSet; 2877 } 2878 2879 2880 2881 //------------------------------------------------------------------------------------------- 2882 // 2883 // RBBILineMonkey 2884 // 2885 //------------------------------------------------------------------------------------------- 2886 2887 class RBBILineMonkey: public RBBIMonkeyKind { 2888 public: 2889 RBBILineMonkey(); 2890 virtual ~RBBILineMonkey(); 2891 virtual UVector *charClasses(); 2892 virtual void setText(const UnicodeString &s); 2893 virtual int32_t next(int32_t i); 2894 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2895 private: 2896 UVector *fSets; 2897 2898 UnicodeSet *fBK; 2899 UnicodeSet *fCR; 2900 UnicodeSet *fLF; 2901 UnicodeSet *fCM; 2902 UnicodeSet *fNL; 2903 UnicodeSet *fSG; 2904 UnicodeSet *fWJ; 2905 UnicodeSet *fZW; 2906 UnicodeSet *fGL; 2907 UnicodeSet *fCB; 2908 UnicodeSet *fSP; 2909 UnicodeSet *fB2; 2910 UnicodeSet *fBA; 2911 UnicodeSet *fBB; 2912 UnicodeSet *fHY; 2913 UnicodeSet *fH2; 2914 UnicodeSet *fH3; 2915 UnicodeSet *fCL; 2916 UnicodeSet *fCP; 2917 UnicodeSet *fEX; 2918 UnicodeSet *fIN; 2919 UnicodeSet *fJL; 2920 UnicodeSet *fJV; 2921 UnicodeSet *fJT; 2922 UnicodeSet *fNS; 2923 UnicodeSet *fOP; 2924 UnicodeSet *fQU; 2925 UnicodeSet *fIS; 2926 UnicodeSet *fNU; 2927 UnicodeSet *fPO; 2928 UnicodeSet *fPR; 2929 UnicodeSet *fSY; 2930 UnicodeSet *fAI; 2931 UnicodeSet *fAL; 2932 UnicodeSet *fCJ; 2933 UnicodeSet *fHL; 2934 UnicodeSet *fID; 2935 UnicodeSet *fRI; 2936 UnicodeSet *fSA; 2937 UnicodeSet *fXX; 2938 2939 BreakIterator *fCharBI; 2940 const UnicodeString *fText; 2941 RegexMatcher *fNumberMatcher; 2942 }; 2943 2944 2945 RBBILineMonkey::RBBILineMonkey() 2946 { 2947 UErrorCode status = U_ZERO_ERROR; 2948 2949 fSets = new UVector(status); 2950 2951 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2952 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2953 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2954 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2955 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2956 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2957 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2958 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2959 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2960 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2961 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2962 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2963 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2964 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2965 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2966 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2967 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2968 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2969 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2970 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2971 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2972 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2973 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2974 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2975 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2976 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2977 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2978 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2979 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2980 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2981 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2982 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2983 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2984 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2985 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2986 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2987 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2988 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2989 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2990 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2991 2992 if (U_FAILURE(status)) { 2993 deferredStatus = status; 2994 fCharBI = NULL; 2995 fNumberMatcher = NULL; 2996 return; 2997 } 2998 2999 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3000 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3001 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 3002 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3003 3004 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 3005 3006 fSets->addElement(fBK, status); 3007 fSets->addElement(fCR, status); 3008 fSets->addElement(fLF, status); 3009 fSets->addElement(fCM, status); 3010 fSets->addElement(fNL, status); 3011 fSets->addElement(fWJ, status); 3012 fSets->addElement(fZW, status); 3013 fSets->addElement(fGL, status); 3014 fSets->addElement(fCB, status); 3015 fSets->addElement(fSP, status); 3016 fSets->addElement(fB2, status); 3017 fSets->addElement(fBA, status); 3018 fSets->addElement(fBB, status); 3019 fSets->addElement(fHY, status); 3020 fSets->addElement(fH2, status); 3021 fSets->addElement(fH3, status); 3022 fSets->addElement(fCL, status); 3023 fSets->addElement(fCP, status); 3024 fSets->addElement(fEX, status); 3025 fSets->addElement(fIN, status); 3026 fSets->addElement(fJL, status); 3027 fSets->addElement(fJT, status); 3028 fSets->addElement(fJV, status); 3029 fSets->addElement(fNS, status); 3030 fSets->addElement(fOP, status); 3031 fSets->addElement(fQU, status); 3032 fSets->addElement(fIS, status); 3033 fSets->addElement(fNU, status); 3034 fSets->addElement(fPO, status); 3035 fSets->addElement(fPR, status); 3036 fSets->addElement(fSY, status); 3037 fSets->addElement(fAI, status); 3038 fSets->addElement(fAL, status); 3039 fSets->addElement(fHL, status); 3040 fSets->addElement(fID, status); 3041 fSets->addElement(fWJ, status); 3042 fSets->addElement(fRI, status); 3043 fSets->addElement(fSA, status); 3044 fSets->addElement(fSG, status); 3045 3046 const char *rules = 3047 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3048 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3049 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3050 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3051 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 3052 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3053 3054 fNumberMatcher = new RegexMatcher( 3055 UnicodeString(rules, -1, US_INV), 0, status); 3056 3057 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3058 3059 if (U_FAILURE(status)) { 3060 deferredStatus = status; 3061 } 3062 } 3063 3064 3065 void RBBILineMonkey::setText(const UnicodeString &s) { 3066 fText = &s; 3067 fCharBI->setText(s); 3068 fNumberMatcher->reset(s); 3069 } 3070 3071 // 3072 // rule9Adjust 3073 // Line Break TR rules 9 and 10 implementation. 3074 // This deals with combining marks and other sequences that 3075 // that must be treated as if they were something other than what they actually are. 3076 // 3077 // This is factored out into a separate function because it must be applied twice for 3078 // each potential break, once to the chars before the position being checked, then 3079 // again to the text following the possible break. 3080 // 3081 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3082 if (pos == -1) { 3083 // Invalid initial position. Happens during the warmup iteration of the 3084 // main loop in next(). 3085 return; 3086 } 3087 3088 int32_t nPos = *nextPos; 3089 3090 // LB 9 Keep combining sequences together. 3091 // advance over any CM class chars. Note that Line Break CM is different 3092 // from the normal Grapheme Extend property. 3093 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3094 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3095 for (;;) { 3096 *nextChar = fText->char32At(nPos); 3097 if (!fCM->contains(*nextChar)) { 3098 break; 3099 } 3100 nPos = fText->moveIndex32(nPos, 1); 3101 } 3102 } 3103 3104 3105 // LB 9 Treat X CM* as if it were x. 3106 // No explicit action required. 3107 3108 // LB 10 Treat any remaining combining mark as AL 3109 if (fCM->contains(*posChar)) { 3110 *posChar = 0x41; // thisChar = 'A'; 3111 } 3112 3113 // Push the updated nextPos and nextChar back to our caller. 3114 // This only makes a difference if posChar got bigger by consuming a 3115 // combining sequence. 3116 *nextPos = nPos; 3117 *nextChar = fText->char32At(nPos); 3118 } 3119 3120 3121 3122 int32_t RBBILineMonkey::next(int32_t startPos) { 3123 UErrorCode status = U_ZERO_ERROR; 3124 int32_t pos; // Index of the char following a potential break position 3125 UChar32 thisChar; // Character at above position "pos" 3126 3127 int32_t prevPos; // Index of the char preceding a potential break position 3128 UChar32 prevChar; // Character at above position. Note that prevChar 3129 // and thisChar may not be adjacent because combining 3130 // characters between them will be ignored. 3131 3132 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 3133 UChar32 prevCharX2; 3134 3135 int32_t nextPos; // Index of the next character following pos. 3136 // Usually skips over combining marks. 3137 int32_t nextCPPos; // Index of the code point following "pos." 3138 // May point to a combining mark. 3139 int32_t tPos; // temp value. 3140 UChar32 c; 3141 3142 if (U_FAILURE(deferredStatus)) { 3143 return -1; 3144 } 3145 3146 if (startPos >= fText->length()) { 3147 return -1; 3148 } 3149 3150 3151 // Initial values for loop. Loop will run the first time without finding breaks, 3152 // while the invalid values shift out and the "this" and 3153 // "prev" positions are filled in with good values. 3154 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 3155 thisChar = prevChar = prevCharX2 = 0; 3156 nextPos = nextCPPos = startPos; 3157 3158 3159 // Loop runs once per position in the test text, until a break position 3160 // is found. 3161 for (;;) { 3162 prevPosX2 = prevPos; 3163 prevCharX2 = prevChar; 3164 3165 prevPos = pos; 3166 prevChar = thisChar; 3167 3168 pos = nextPos; 3169 thisChar = fText->char32At(pos); 3170 3171 nextCPPos = fText->moveIndex32(pos, 1); 3172 nextPos = nextCPPos; 3173 3174 // Rule LB2 - Break at end of text. 3175 if (pos >= fText->length()) { 3176 break; 3177 } 3178 3179 // Rule LB 9 - adjust for combining sequences. 3180 // We do this one out-of-order because the adjustment does not change anything 3181 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3182 // be applied. 3183 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3184 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3185 c = fText->char32At(nextPos); 3186 rule9Adjust(pos, &thisChar, &nextPos, &c); 3187 3188 // If the loop is still warming up - if we haven't shifted the initial 3189 // -1 positions out of prevPos yet - loop back to advance the 3190 // position in the input without any further looking for breaks. 3191 if (prevPos == -1) { 3192 continue; 3193 } 3194 3195 // LB 4 Always break after hard line breaks, 3196 if (fBK->contains(prevChar)) { 3197 break; 3198 } 3199 3200 // LB 5 Break after CR, LF, NL, but not inside CR LF 3201 if (prevChar == 0x0d && thisChar == 0x0a) { 3202 continue; 3203 } 3204 if (prevChar == 0x0d || 3205 prevChar == 0x0a || 3206 prevChar == 0x85) { 3207 break; 3208 } 3209 3210 // LB 6 Don't break before hard line breaks 3211 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3212 fBK->contains(thisChar)) { 3213 continue; 3214 } 3215 3216 3217 // LB 7 Don't break before spaces or zero-width space. 3218 if (fSP->contains(thisChar)) { 3219 continue; 3220 } 3221 3222 if (fZW->contains(thisChar)) { 3223 continue; 3224 } 3225 3226 // LB 8 Break after zero width space 3227 if (fZW->contains(prevChar)) { 3228 break; 3229 } 3230 3231 // LB 9, 10 Already done, at top of loop. 3232 // 3233 3234 3235 // LB 11 Do not break before or after WORD JOINER and related characters. 3236 // x WJ 3237 // WJ x 3238 // 3239 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3240 continue; 3241 } 3242 3243 // LB 12 3244 // GL x 3245 if (fGL->contains(prevChar)) { 3246 continue; 3247 } 3248 3249 // LB 12a 3250 // [^SP BA HY] x GL 3251 if (!(fSP->contains(prevChar) || 3252 fBA->contains(prevChar) || 3253 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3254 continue; 3255 } 3256 3257 3258 3259 // LB 13 Don't break before closings. 3260 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3261 // fall into LB 17 and the more general number regular expression. 3262 // 3263 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3264 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3265 fEX->contains(thisChar) || 3266 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3267 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3268 continue; 3269 } 3270 3271 // LB 14 Don't break after OP SP* 3272 // Scan backwards, checking for this sequence. 3273 // The OP char could include combining marks, so we actually check for 3274 // OP CM* SP* 3275 // Another Twist: The Rule 67 fixes may have changed a SP CM 3276 // sequence into a ID char, so before scanning back through spaces, 3277 // verify that prevChar is indeed a space. The prevChar variable 3278 // may differ from fText[prevPos] 3279 tPos = prevPos; 3280 if (fSP->contains(prevChar)) { 3281 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3282 tPos=fText->moveIndex32(tPos, -1); 3283 } 3284 } 3285 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3286 tPos=fText->moveIndex32(tPos, -1); 3287 } 3288 if (fOP->contains(fText->char32At(tPos))) { 3289 continue; 3290 } 3291 3292 3293 // LB 15 QU SP* x OP 3294 if (fOP->contains(thisChar)) { 3295 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3296 int tPos = prevPos; 3297 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3298 tPos = fText->moveIndex32(tPos, -1); 3299 } 3300 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3301 tPos = fText->moveIndex32(tPos, -1); 3302 } 3303 if (fQU->contains(fText->char32At(tPos))) { 3304 continue; 3305 } 3306 } 3307 3308 3309 3310 // LB 16 (CL | CP) SP* x NS 3311 // Scan backwards for SP* CM* (CL | CP) 3312 if (fNS->contains(thisChar)) { 3313 int tPos = prevPos; 3314 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3315 tPos = fText->moveIndex32(tPos, -1); 3316 } 3317 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3318 tPos = fText->moveIndex32(tPos, -1); 3319 } 3320 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3321 continue; 3322 } 3323 } 3324 3325 3326 // LB 17 B2 SP* x B2 3327 if (fB2->contains(thisChar)) { 3328 // Scan backwards, checking for the B2 CM* SP* sequence. 3329 tPos = prevPos; 3330 if (fSP->contains(prevChar)) { 3331 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3332 tPos=fText->moveIndex32(tPos, -1); 3333 } 3334 } 3335 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3336 tPos=fText->moveIndex32(tPos, -1); 3337 } 3338 if (fB2->contains(fText->char32At(tPos))) { 3339 continue; 3340 } 3341 } 3342 3343 3344 // LB 18 break after space 3345 if (fSP->contains(prevChar)) { 3346 break; 3347 } 3348 3349 // LB 19 3350 // x QU 3351 // QU x 3352 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3353 continue; 3354 } 3355 3356 // LB 20 Break around a CB 3357 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3358 break; 3359 } 3360 3361 // LB 21 3362 if (fBA->contains(thisChar) || 3363 fHY->contains(thisChar) || 3364 fNS->contains(thisChar) || 3365 fBB->contains(prevChar) ) { 3366 continue; 3367 } 3368 3369 // LB 21a 3370 // HL (HY | BA) x 3371 if (fHL->contains(prevCharX2) && 3372 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3373 continue; 3374 } 3375 3376 // LB 21b 3377 // SY x HL 3378 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3379 continue; 3380 } 3381 3382 // LB 22 3383 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3384 (fEX->contains(prevChar) && fIN->contains(thisChar)) || 3385 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3386 (fID->contains(prevChar) && fIN->contains(thisChar)) || 3387 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3388 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3389 continue; 3390 } 3391 3392 3393 // LB 23 ID x PO 3394 // AL x NU 3395 // HL x NU 3396 // NU x AL 3397 if ((fID->contains(prevChar) && fPO->contains(thisChar)) || 3398 (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3399 (fHL->contains(prevChar) && fNU->contains(thisChar)) || 3400 (fNU->contains(prevChar) && fAL->contains(thisChar)) || 3401 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) { 3402 continue; 3403 } 3404 3405 // LB 24 Do not break between prefix and letters or ideographs. 3406 // PR x ID 3407 // PR x (AL | HL) 3408 // PO x (AL | HL) 3409 if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3410 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) || 3411 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) { 3412 continue; 3413 } 3414 3415 3416 3417 // LB 25 Numbers 3418 if (fNumberMatcher->lookingAt(prevPos, status)) { 3419 if (U_FAILURE(status)) { 3420 break; 3421 } 3422 // Matched a number. But could have been just a single digit, which would 3423 // not represent a "no break here" between prevChar and thisChar 3424 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3425 if (numEndIdx > pos) { 3426 // Number match includes at least our two chars being checked 3427 if (numEndIdx > nextPos) { 3428 // Number match includes additional chars. Update pos and nextPos 3429 // so that next loop iteration will continue at the end of the number, 3430 // checking for breaks between last char in number & whatever follows. 3431 pos = nextPos = numEndIdx; 3432 do { 3433 pos = fText->moveIndex32(pos, -1); 3434 thisChar = fText->char32At(pos); 3435 } while (fCM->contains(thisChar)); 3436 } 3437 continue; 3438 } 3439 } 3440 3441 3442 // LB 26 Do not break a Korean syllable. 3443 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3444 fJV->contains(thisChar) || 3445 fH2->contains(thisChar) || 3446 fH3->contains(thisChar))) { 3447 continue; 3448 } 3449 3450 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3451 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3452 continue; 3453 } 3454 3455 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3456 fJT->contains(thisChar)) { 3457 continue; 3458 } 3459 3460 // LB 27 Treat a Korean Syllable Block the same as ID. 3461 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3462 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3463 fIN->contains(thisChar)) { 3464 continue; 3465 } 3466 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3467 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3468 fPO->contains(thisChar)) { 3469 continue; 3470 } 3471 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3472 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3473 continue; 3474 } 3475 3476 3477 3478 // LB 28 Do not break between alphabetics ("at"). 3479 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3480 continue; 3481 } 3482 3483 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3484 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3485 continue; 3486 } 3487 3488 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 3489 // (AL | NU) x OP 3490 // CP x (AL | NU) 3491 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3492 continue; 3493 } 3494 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3495 continue; 3496 } 3497 3498 // LB30a Do not break between regional indicators. 3499 // RI x RI 3500 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3501 continue; 3502 } 3503 3504 // LB 31 Break everywhere else 3505 break; 3506 3507 } 3508 3509 return pos; 3510 } 3511 3512 3513 UVector *RBBILineMonkey::charClasses() { 3514 return fSets; 3515 } 3516 3517 3518 RBBILineMonkey::~RBBILineMonkey() { 3519 delete fSets; 3520 3521 delete fBK; 3522 delete fCR; 3523 delete fLF; 3524 delete fCM; 3525 delete fNL; 3526 delete fWJ; 3527 delete fZW; 3528 delete fGL; 3529 delete fCB; 3530 delete fSP; 3531 delete fB2; 3532 delete fBA; 3533 delete fBB; 3534 delete fHY; 3535 delete fH2; 3536 delete fH3; 3537 delete fCL; 3538 delete fCP; 3539 delete fEX; 3540 delete fIN; 3541 delete fJL; 3542 delete fJV; 3543 delete fJT; 3544 delete fNS; 3545 delete fOP; 3546 delete fQU; 3547 delete fIS; 3548 delete fNU; 3549 delete fPO; 3550 delete fPR; 3551 delete fSY; 3552 delete fAI; 3553 delete fAL; 3554 delete fCJ; 3555 delete fHL; 3556 delete fID; 3557 delete fRI; 3558 delete fSA; 3559 delete fSG; 3560 delete fXX; 3561 3562 delete fCharBI; 3563 delete fNumberMatcher; 3564 } 3565 3566 3567 //------------------------------------------------------------------------------------------- 3568 // 3569 // TestMonkey 3570 // 3571 // params 3572 // seed=nnnnn Random number starting seed. 3573 // Setting the seed allows errors to be reproduced. 3574 // loop=nnn Looping count. Controls running time. 3575 // -1: run forever. 3576 // 0 or greater: run length. 3577 // 3578 // type = char | word | line | sent | title 3579 // 3580 //------------------------------------------------------------------------------------------- 3581 3582 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3583 int32_t val = defaultVal; 3584 name.append(" *= *(-?\\d+)"); 3585 UErrorCode status = U_ZERO_ERROR; 3586 RegexMatcher m(name, params, 0, status); 3587 if (m.find()) { 3588 // The param exists. Convert the string to an int. 3589 char valString[100]; 3590 int32_t paramLength = m.end(1, status) - m.start(1, status); 3591 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3592 paramLength = (int32_t)(sizeof(valString)-2); 3593 } 3594 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3595 val = strtol(valString, NULL, 10); 3596 3597 // Delete this parameter from the params string. 3598 m.reset(); 3599 params = m.replaceFirst("", status); 3600 } 3601 U_ASSERT(U_SUCCESS(status)); 3602 return val; 3603 } 3604 #endif 3605 3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3607 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3608 BreakIterator *bi, 3609 int expected[], 3610 int expectedcount) 3611 { 3612 int count = 0; 3613 int i = 0; 3614 int forward[50]; 3615 bi->setText(ustr); 3616 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3617 forward[count] = i; 3618 if (count < expectedcount && expected[count] != i) { 3619 test->errln("break forward test failed: expected %d but got %d", 3620 expected[count], i); 3621 break; 3622 } 3623 count ++; 3624 } 3625 if (count != expectedcount) { 3626 printStringBreaks(ustr, expected, expectedcount); 3627 test->errln("break forward test failed: missed %d match", 3628 expectedcount - count); 3629 return; 3630 } 3631 // testing boundaries 3632 for (i = 1; i < expectedcount; i ++) { 3633 int j = expected[i - 1]; 3634 if (!bi->isBoundary(j)) { 3635 printStringBreaks(ustr, expected, expectedcount); 3636 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3637 return; 3638 } 3639 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3640 if (bi->isBoundary(j)) { 3641 printStringBreaks(ustr, expected, expectedcount); 3642 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3643 return; 3644 } 3645 } 3646 } 3647 3648 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3649 count --; 3650 if (forward[count] != i) { 3651 printStringBreaks(ustr, expected, expectedcount); 3652 test->errln("happy break test previous() failed: expected %d but got %d", 3653 forward[count], i); 3654 break; 3655 } 3656 } 3657 if (count != 0) { 3658 printStringBreaks(ustr, expected, expectedcount); 3659 test->errln("break test previous() failed: missed a match"); 3660 return; 3661 } 3662 3663 // testing preceding 3664 for (i = 0; i < expectedcount - 1; i ++) { 3665 // int j = expected[i] + 1; 3666 int j = ustr.moveIndex32(expected[i], 1); 3667 for (; j <= expected[i + 1]; j ++) { 3668 if (bi->preceding(j) != expected[i]) { 3669 printStringBreaks(ustr, expected, expectedcount); 3670 test->errln("preceding(): Not expecting boundary at position %d", j); 3671 return; 3672 } 3673 } 3674 } 3675 } 3676 #endif 3677 3678 void RBBITest::TestWordBreaks(void) 3679 { 3680 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3681 3682 Locale locale("en"); 3683 UErrorCode status = U_ZERO_ERROR; 3684 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3685 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3686 // Replaced any C+J characters in a row with a random sequence of characters 3687 // of the same length to make our C+J segmentation not get in the way. 3688 static const char *strlist[] = 3689 { 3690 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3691 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3692 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3693 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3694 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3695 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3696 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3697 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3698 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3699 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3700 "\\u2027\\U000e0067\\u0a47\\u00b7", 3701 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3702 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3703 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3704 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3705 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3706 "\\u0027\\u11af\\U000e0057\\u0602", 3707 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3708 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3709 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3710 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3711 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3712 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3713 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3714 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3715 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3716 "\\u18f4\\U000e0049\\u20e7\\u2027", 3717 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3718 "\\ua183\\u102d\\u0bec\\u003a", 3719 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3720 "\\u003a\\u0e57\\u0fad\\u002e", 3721 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3722 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3723 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3724 "\\u003a\\u0664\\u00b7\\u1fba", 3725 "\\u003b\\u0027\\u00b7\\u47a3", 3726 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3727 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3728 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3729 }; 3730 int loop; 3731 if (U_FAILURE(status)) { 3732 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3733 return; 3734 } 3735 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3736 // printf("looping %d\n", loop); 3737 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3738 // RBBICharMonkey monkey; 3739 RBBIWordMonkey monkey; 3740 3741 int expected[50]; 3742 int expectedcount = 0; 3743 3744 monkey.setText(ustr); 3745 int i; 3746 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3747 expected[expectedcount ++] = i; 3748 } 3749 3750 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3751 } 3752 delete bi; 3753 #endif 3754 } 3755 3756 void RBBITest::TestWordBoundary(void) 3757 { 3758 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3759 Locale locale("en"); 3760 UErrorCode status = U_ZERO_ERROR; 3761 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3762 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3763 UChar str[50]; 3764 static const char *strlist[] = 3765 { 3766 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3767 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3768 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3769 "\\u2027\\U000e0067\\u0a47\\u00b7", 3770 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3771 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3772 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3773 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3774 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3775 "\\u0027\\u11af\\U000e0057\\u0602", 3776 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3777 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3778 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3779 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3780 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3781 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3782 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3783 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3784 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3785 "\\u58f4\\U000e0049\\u20e7\\u2027", 3786 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3787 "\\ua183\\u102d\\u0bec\\u003a", 3788 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3789 "\\u003a\\u0e57\\u0fad\\u002e", 3790 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3791 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3792 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3793 "\\u003a\\u0664\\u00b7\\u1fba", 3794 "\\u003b\\u0027\\u00b7\\u47a3", 3795 }; 3796 int loop; 3797 if (U_FAILURE(status)) { 3798 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3799 return; 3800 } 3801 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3802 // printf("looping %d\n", loop); 3803 u_unescape(strlist[loop], str, 20); 3804 UnicodeString ustr(str); 3805 int forward[50]; 3806 int count = 0; 3807 3808 bi->setText(ustr); 3809 int prev = 0; 3810 int i; 3811 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3812 forward[count ++] = i; 3813 if (i > prev) { 3814 int j; 3815 for (j = prev + 1; j < i; j ++) { 3816 if (bi->isBoundary(j)) { 3817 printStringBreaks(ustr, forward, count); 3818 errln("happy boundary test failed: expected %d not a boundary", 3819 j); 3820 return; 3821 } 3822 } 3823 } 3824 if (!bi->isBoundary(i)) { 3825 printStringBreaks(ustr, forward, count); 3826 errln("happy boundary test failed: expected %d a boundary", 3827 i); 3828 return; 3829 } 3830 prev = i; 3831 } 3832 } 3833 delete bi; 3834 } 3835 3836 void RBBITest::TestLineBreaks(void) 3837 { 3838 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3839 Locale locale("en"); 3840 UErrorCode status = U_ZERO_ERROR; 3841 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3842 const int32_t STRSIZE = 50; 3843 UChar str[STRSIZE]; 3844 static const char *strlist[] = 3845 { 3846 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3847 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3848 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3849 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3850 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3851 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3852 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3853 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3854 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3855 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3856 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3857 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3858 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3859 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3860 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3861 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3862 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3863 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3864 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3865 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3866 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3867 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3868 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3869 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3870 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3871 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3872 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3873 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3874 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3875 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3876 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3877 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3878 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3879 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3880 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3881 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3882 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3883 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3884 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3885 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3886 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3887 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3888 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3889 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3890 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3891 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3892 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3893 }; 3894 int loop; 3895 TEST_ASSERT_SUCCESS(status); 3896 if (U_FAILURE(status)) { 3897 return; 3898 } 3899 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3900 // printf("looping %d\n", loop); 3901 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3902 if (t >= STRSIZE) { 3903 TEST_ASSERT(FALSE); 3904 continue; 3905 } 3906 3907 3908 UnicodeString ustr(str); 3909 RBBILineMonkey monkey; 3910 if (U_FAILURE(monkey.deferredStatus)) { 3911 continue; 3912 } 3913 3914 const int EXPECTEDSIZE = 50; 3915 int expected[EXPECTEDSIZE]; 3916 int expectedcount = 0; 3917 3918 monkey.setText(ustr); 3919 int i; 3920 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3921 if (expectedcount >= EXPECTEDSIZE) { 3922 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3923 return; 3924 } 3925 expected[expectedcount ++] = i; 3926 } 3927 3928 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3929 } 3930 delete bi; 3931 #endif 3932 } 3933 3934 void RBBITest::TestSentBreaks(void) 3935 { 3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3937 Locale locale("en"); 3938 UErrorCode status = U_ZERO_ERROR; 3939 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3940 UChar str[200]; 3941 static const char *strlist[] = 3942 { 3943 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3944 "This\n", 3945 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3946 "\"Sentence ending with a quote.\" Bye.", 3947 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3948 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3949 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3950 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3951 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3952 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3953 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3954 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3955 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3956 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3957 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3958 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3959 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3960 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3961 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3962 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3963 }; 3964 int loop; 3965 if (U_FAILURE(status)) { 3966 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3967 return; 3968 } 3969 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3970 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3971 UnicodeString ustr(str); 3972 3973 RBBISentMonkey monkey; 3974 if (U_FAILURE(monkey.deferredStatus)) { 3975 continue; 3976 } 3977 3978 const int EXPECTEDSIZE = 50; 3979 int expected[EXPECTEDSIZE]; 3980 int expectedcount = 0; 3981 3982 monkey.setText(ustr); 3983 int i; 3984 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3985 if (expectedcount >= EXPECTEDSIZE) { 3986 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3987 return; 3988 } 3989 expected[expectedcount ++] = i; 3990 } 3991 3992 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3993 } 3994 delete bi; 3995 #endif 3996 } 3997 3998 void RBBITest::TestMonkey(char *params) { 3999 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4000 4001 UErrorCode status = U_ZERO_ERROR; 4002 int32_t loopCount = 500; 4003 int32_t seed = 1; 4004 UnicodeString breakType = "all"; 4005 Locale locale("en"); 4006 UBool useUText = FALSE; 4007 4008 if (quick == FALSE) { 4009 loopCount = 10000; 4010 } 4011 4012 if (params) { 4013 UnicodeString p(params); 4014 loopCount = getIntParam("loop", p, loopCount); 4015 seed = getIntParam("seed", p, seed); 4016 4017 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4018 if (m.find()) { 4019 breakType = m.group(1, status); 4020 m.reset(); 4021 p = m.replaceFirst("", status); 4022 } 4023 4024 RegexMatcher u(" *utext", p, 0, status); 4025 if (u.find()) { 4026 useUText = TRUE; 4027 u.reset(); 4028 p = u.replaceFirst("", status); 4029 } 4030 4031 4032 // m.reset(p); 4033 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4034 // Each option is stripped out of the option string as it is processed. 4035 // All options have been checked. The option string should have been completely emptied.. 4036 char buf[100]; 4037 p.extract(buf, sizeof(buf), NULL, status); 4038 buf[sizeof(buf)-1] = 0; 4039 errln("Unrecognized or extra parameter: %s\n", buf); 4040 return; 4041 } 4042 4043 } 4044 4045 if (breakType == "char" || breakType == "all") { 4046 RBBICharMonkey m; 4047 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4048 if (U_SUCCESS(status)) { 4049 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4050 if (breakType == "all" && useUText==FALSE) { 4051 // Also run a quick test with UText when "all" is specified 4052 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4053 } 4054 } 4055 else { 4056 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4057 } 4058 delete bi; 4059 } 4060 4061 if (breakType == "word" || breakType == "all") { 4062 logln("Word Break Monkey Test"); 4063 RBBIWordMonkey m; 4064 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4065 if (U_SUCCESS(status)) { 4066 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4067 } 4068 else { 4069 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4070 } 4071 delete bi; 4072 } 4073 4074 if (breakType == "line" || breakType == "all") { 4075 logln("Line Break Monkey Test"); 4076 RBBILineMonkey m; 4077 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4078 if (loopCount >= 10) { 4079 loopCount = loopCount / 5; // Line break runs slower than the others. 4080 } 4081 if (U_SUCCESS(status)) { 4082 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4083 } 4084 else { 4085 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4086 } 4087 delete bi; 4088 } 4089 4090 if (breakType == "sent" || breakType == "all" ) { 4091 logln("Sentence Break Monkey Test"); 4092 RBBISentMonkey m; 4093 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4094 if (loopCount >= 10) { 4095 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4096 } 4097 if (U_SUCCESS(status)) { 4098 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4099 } 4100 else { 4101 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4102 } 4103 delete bi; 4104 } 4105 4106 #endif 4107 } 4108 4109 // 4110 // Run a RBBI monkey test. Common routine, for all break iterator types. 4111 // Parameters: 4112 // bi - the break iterator to use 4113 // mk - MonkeyKind, abstraction for obtaining expected results 4114 // name - Name of test (char, word, etc.) for use in error messages 4115 // seed - Seed for starting random number generator (parameter from user) 4116 // numIterations 4117 // 4118 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4119 int32_t numIterations, UBool useUText) { 4120 4121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4122 4123 const int32_t TESTSTRINGLEN = 500; 4124 UnicodeString testText; 4125 int32_t numCharClasses; 4126 UVector *chClasses; 4127 int expected[TESTSTRINGLEN*2 + 1]; 4128 int expectedCount = 0; 4129 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4130 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4131 char reverseBreaks[TESTSTRINGLEN*2+1]; 4132 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4133 char followingBreaks[TESTSTRINGLEN*2+1]; 4134 char precedingBreaks[TESTSTRINGLEN*2+1]; 4135 int i; 4136 int loopCount = 0; 4137 4138 m_seed = seed; 4139 4140 numCharClasses = mk.charClasses()->size(); 4141 chClasses = mk.charClasses(); 4142 4143 // Check for errors that occured during the construction of the MonkeyKind object. 4144 // Can't report them where they occured because errln() is a method coming from intlTest, 4145 // and is not visible outside of RBBITest :-( 4146 if (U_FAILURE(mk.deferredStatus)) { 4147 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4148 return; 4149 } 4150 4151 // Verify that the character classes all have at least one member. 4152 for (i=0; i<numCharClasses; i++) { 4153 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4154 if (s == NULL || s->size() == 0) { 4155 errln("Character Class #%d is null or of zero size.", i); 4156 return; 4157 } 4158 } 4159 4160 while (loopCount < numIterations || numIterations == -1) { 4161 if (numIterations == -1 && loopCount % 10 == 0) { 4162 // If test is running in an infinite loop, display a periodic tic so 4163 // we can tell that it is making progress. 4164 fprintf(stderr, "."); 4165 } 4166 // Save current random number seed, so that we can recreate the random numbers 4167 // for this loop iteration in event of an error. 4168 seed = m_seed; 4169 4170 // Populate a test string with data. 4171 testText.truncate(0); 4172 for (i=0; i<TESTSTRINGLEN; i++) { 4173 int32_t aClassNum = m_rand() % numCharClasses; 4174 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4175 int32_t charIdx = m_rand() % classSet->size(); 4176 UChar32 c = classSet->charAt(charIdx); 4177 if (c < 0) { // TODO: deal with sets containing strings. 4178 errln("c < 0"); 4179 break; 4180 } 4181 testText.append(c); 4182 } 4183 4184 // Calculate the expected results for this test string. 4185 mk.setText(testText); 4186 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4187 expectedBreaks[0] = 1; 4188 int32_t breakPos = 0; 4189 expectedCount = 0; 4190 for (;;) { 4191 breakPos = mk.next(breakPos); 4192 if (breakPos == -1) { 4193 break; 4194 } 4195 if (breakPos > testText.length()) { 4196 errln("breakPos > testText.length()"); 4197 } 4198 expectedBreaks[breakPos] = 1; 4199 U_ASSERT(expectedCount<testText.length()); 4200 expected[expectedCount ++] = breakPos; 4201 (void)expected; // Set but not used warning. 4202 // TODO (andy): check it out. 4203 } 4204 4205 // Find the break positions using forward iteration 4206 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4207 if (useUText) { 4208 UErrorCode status = U_ZERO_ERROR; 4209 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4210 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4211 bi->setText(testUText, status); 4212 TEST_ASSERT_SUCCESS(status); 4213 utext_close(testUText); // The break iterator does a shallow clone of the UText 4214 // This UText can be closed immediately, so long as the 4215 // testText string continues to exist. 4216 } else { 4217 bi->setText(testText); 4218 } 4219 4220 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4221 if (i < 0 || i > testText.length()) { 4222 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4223 break; 4224 } 4225 forwardBreaks[i] = 1; 4226 } 4227 4228 // Find the break positions using reverse iteration 4229 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4230 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4231 if (i < 0 || i > testText.length()) { 4232 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4233 break; 4234 } 4235 reverseBreaks[i] = 1; 4236 } 4237 4238 // Find the break positions using isBoundary() tests. 4239 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4240 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4241 for (i=0; i<=testText.length(); i++) { 4242 isBoundaryBreaks[i] = bi->isBoundary(i); 4243 } 4244 4245 4246 // Find the break positions using the following() function. 4247 // printf("."); 4248 memset(followingBreaks, 0, sizeof(followingBreaks)); 4249 int32_t lastBreakPos = 0; 4250 followingBreaks[0] = 1; 4251 for (i=0; i<testText.length(); i++) { 4252 breakPos = bi->following(i); 4253 if (breakPos <= i || 4254 breakPos < lastBreakPos || 4255 breakPos > testText.length() || 4256 (breakPos > lastBreakPos && lastBreakPos > i)) { 4257 errln("%s break monkey test: " 4258 "Out of range value returned by BreakIterator::following().\n" 4259 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4260 name, seed, i, breakPos, lastBreakPos); 4261 break; 4262 } 4263 followingBreaks[breakPos] = 1; 4264 lastBreakPos = breakPos; 4265 } 4266 4267 // Find the break positions using the preceding() function. 4268 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4269 lastBreakPos = testText.length(); 4270 precedingBreaks[testText.length()] = 1; 4271 for (i=testText.length(); i>0; i--) { 4272 breakPos = bi->preceding(i); 4273 if (breakPos >= i || 4274 breakPos > lastBreakPos || 4275 (breakPos < 0 && testText.getChar32Start(i)>0) || 4276 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4277 errln("%s break monkey test: " 4278 "Out of range value returned by BreakIterator::preceding().\n" 4279 "index=%d; prev returned %d; lastBreak=%d" , 4280 name, i, breakPos, lastBreakPos); 4281 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4282 precedingBreaks[i] = 2; // Forces an error. 4283 } 4284 } else { 4285 if (breakPos >= 0) { 4286 precedingBreaks[breakPos] = 1; 4287 } 4288 lastBreakPos = breakPos; 4289 } 4290 } 4291 4292 // Compare the expected and actual results. 4293 for (i=0; i<=testText.length(); i++) { 4294 const char *errorType = NULL; 4295 if (forwardBreaks[i] != expectedBreaks[i]) { 4296 errorType = "next()"; 4297 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4298 errorType = "previous()"; 4299 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4300 errorType = "isBoundary()"; 4301 } else if (followingBreaks[i] != expectedBreaks[i]) { 4302 errorType = "following()"; 4303 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4304 errorType = "preceding()"; 4305 } 4306 4307 4308 if (errorType != NULL) { 4309 // Format a range of the test text that includes the failure as 4310 // a data item that can be included in the rbbi test data file. 4311 4312 // Start of the range is the last point where expected and actual results 4313 // both agreed that there was a break position. 4314 int startContext = i; 4315 int32_t count = 0; 4316 for (;;) { 4317 if (startContext==0) { break; } 4318 startContext --; 4319 if (expectedBreaks[startContext] != 0) { 4320 if (count == 2) break; 4321 count ++; 4322 } 4323 } 4324 4325 // End of range is two expected breaks past the start position. 4326 int endContext = i + 1; 4327 int ci; 4328 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4329 for (;;) { 4330 if (endContext >= testText.length()) {break;} 4331 if (expectedBreaks[endContext-1] != 0) { 4332 if (count == 0) break; 4333 count --; 4334 } 4335 endContext ++; 4336 } 4337 } 4338 4339 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4340 UnicodeString errorText = "<data>"; 4341 /***if (strcmp(errorType, "next()") == 0) { 4342 startContext = 0; 4343 endContext = testText.length(); 4344 4345 printStringBreaks(testText, expected, expectedCount); 4346 }***/ 4347 4348 for (ci=startContext; ci<endContext;) { 4349 UnicodeString hexChars("0123456789abcdef"); 4350 UChar32 c; 4351 int bn; 4352 c = testText.char32At(ci); 4353 if (ci == i) { 4354 // This is the location of the error. 4355 errorText.append("<?>"); 4356 } else if (expectedBreaks[ci] != 0) { 4357 // This a non-error expected break position. 4358 errorText.append("\\"); 4359 } 4360 if (c < 0x10000) { 4361 errorText.append("\\u"); 4362 for (bn=12; bn>=0; bn-=4) { 4363 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4364 } 4365 } else { 4366 errorText.append("\\U"); 4367 for (bn=28; bn>=0; bn-=4) { 4368 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4369 } 4370 } 4371 ci = testText.moveIndex32(ci, 1); 4372 } 4373 errorText.append("\\"); 4374 errorText.append("</data>\n"); 4375 4376 // Output the error 4377 char charErrorTxt[500]; 4378 UErrorCode status = U_ZERO_ERROR; 4379 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4380 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4381 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4382 4383 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4384 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4385 errorType, seed, i, charErrorTxt); 4386 break; 4387 } 4388 } 4389 4390 loopCount++; 4391 } 4392 #endif 4393 } 4394 4395 4396 // Bug 5532. UTF-8 based UText fails in dictionary code. 4397 // This test checks the initial patch, 4398 // which is to just keep it from crashing. Correct word boundaries 4399 // await a proper fix to the dictionary code. 4400 // 4401 void RBBITest::TestBug5532(void) { 4402 // Text includes a mixture of Thai and Latin. 4403 const unsigned char utf8Data[] = { 4404 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4405 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4406 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4407 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4408 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4409 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4410 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4411 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4412 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4413 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4414 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4415 4416 UErrorCode status = U_ZERO_ERROR; 4417 UText utext=UTEXT_INITIALIZER; 4418 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4419 TEST_ASSERT_SUCCESS(status); 4420 4421 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4422 TEST_ASSERT_SUCCESS(status); 4423 if (U_SUCCESS(status)) { 4424 bi->setText(&utext, status); 4425 TEST_ASSERT_SUCCESS(status); 4426 4427 int32_t breakCount = 0; 4428 int32_t previousBreak = -1; 4429 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4430 // For now, just make sure that the break iterator doesn't hang. 4431 TEST_ASSERT(previousBreak < bi->current()); 4432 previousBreak = bi->current(); 4433 } 4434 TEST_ASSERT(breakCount > 0); 4435 } 4436 delete bi; 4437 utext_close(&utext); 4438 } 4439 4440 4441 void RBBITest::TestBug9983(void) { 4442 UnicodeString text = UnicodeString("\\u002A" // * Other 4443 "\\uFF65" // Other 4444 "\\u309C" // Katakana 4445 "\\uFF9F" // Extend 4446 "\\uFF65" // Other 4447 "\\u0020" // Other 4448 "\\u0000").unescape(); 4449 4450 UErrorCode status = U_ZERO_ERROR; 4451 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4452 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4453 TEST_ASSERT_SUCCESS(status); 4454 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4455 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4456 TEST_ASSERT_SUCCESS(status); 4457 if (U_FAILURE(status)) { 4458 return; 4459 } 4460 int32_t offset, rstatus, iterationCount; 4461 4462 brkiter->setText(text); 4463 brkiter->last(); 4464 iterationCount = 0; 4465 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4466 iterationCount++; 4467 rstatus = brkiter->getRuleStatus(); 4468 (void)rstatus; // Suppress set but not used warning. 4469 if (iterationCount >= 10) { 4470 break; 4471 } 4472 } 4473 TEST_ASSERT(iterationCount == 6); 4474 4475 brkiterPOSIX->setText(text); 4476 brkiterPOSIX->last(); 4477 iterationCount = 0; 4478 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4479 iterationCount++; 4480 rstatus = brkiterPOSIX->getRuleStatus(); 4481 (void)rstatus; // Suppress set but not used warning. 4482 if (iterationCount >= 10) { 4483 break; 4484 } 4485 } 4486 TEST_ASSERT(iterationCount == 6); 4487 } 4488 4489 4490 // 4491 // TestDebug - A place-holder test for debugging purposes. 4492 // For putting in fragments of other tests that can be invoked 4493 // for tracing without a lot of unwanted extra stuff happening. 4494 // 4495 void RBBITest::TestDebug(void) { 4496 #if 0 4497 UErrorCode status = U_ZERO_ERROR; 4498 int pos = 0; 4499 int ruleStatus = 0; 4500 4501 RuleBasedBreakIterator* bi = 4502 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4503 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4504 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4505 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4506 // UnicodeString s("Aaa. Bcd"); 4507 s = s.unescape(); 4508 bi->setText(s); 4509 UBool r = bi->isBoundary(8); 4510 printf("%s", r?"true":"false"); 4511 return; 4512 pos = bi->last(); 4513 do { 4514 // ruleStatus = bi->getRuleStatus(); 4515 printf("%d\t%d\n", pos, ruleStatus); 4516 pos = bi->previous(); 4517 } while (pos != BreakIterator::DONE); 4518 #endif 4519 } 4520 4521 void RBBITest::TestProperties() { 4522 UErrorCode errorCode = U_ZERO_ERROR; 4523 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4524 if (!prependSet.isEmpty()) { 4525 errln( 4526 "[:GCB=Prepend:] is not empty any more. " 4527 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4528 "change this test to the opposite condition."); 4529 } 4530 } 4531 4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4533