1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "utypeinfo.h" // for 'typeid' to work 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/utypes.h" 19 #include "unicode/brkiter.h" 20 #include "unicode/rbbi.h" 21 #include "unicode/uchar.h" 22 #include "unicode/utf16.h" 23 #include "unicode/ucnv.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uniset.h" 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 #include "unicode/regex.h" 28 #endif 29 #include "unicode/ustring.h" 30 #include "unicode/utext.h" 31 #include "intltest.h" 32 #include "rbbitst.h" 33 #include <string.h> 34 #include "uvector.h" 35 #include "uvectr32.h" 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include "unicode/numfmt.h" 40 #include "unicode/uscript.h" 41 42 #define TEST_ASSERT(x) {if (!(x)) { \ 43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 44 45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 47 48 49 //--------------------------------------------- 50 // runIndexedTest 51 //--------------------------------------------- 52 53 54 // Note: Before adding new tests to this file, check whether the desired test data can 55 // simply be added to the file testdata/rbbitest.txt. 
In most cases it can, 56 // it's much less work than writing a new test, diagnostic output in the event of failures 57 // is good, and the test data file will is shared with ICU4J, so eventually the test 58 // will run there as well, without additional effort. 59 60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 61 { 62 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 63 64 switch (index) { 65 #if !UCONFIG_NO_FILE_IO 66 case 0: name = "TestBug4153072"; 67 if(exec) TestBug4153072(); break; 68 #else 69 case 0: name = "skip"; 70 break; 71 #endif 72 73 case 1: name = "skip"; 74 break; 75 case 2: name = "TestStatusReturn"; 76 if(exec) TestStatusReturn(); break; 77 78 #if !UCONFIG_NO_FILE_IO 79 case 3: name = "TestUnicodeFiles"; 80 if(exec) TestUnicodeFiles(); break; 81 case 4: name = "TestEmptyString"; 82 if(exec) TestEmptyString(); break; 83 #else 84 case 3: case 4: name = "skip"; 85 break; 86 #endif 87 88 case 5: name = "TestGetAvailableLocales"; 89 if(exec) TestGetAvailableLocales(); break; 90 91 case 6: name = "TestGetDisplayName"; 92 if(exec) TestGetDisplayName(); break; 93 94 #if !UCONFIG_NO_FILE_IO 95 case 7: name = "TestEndBehaviour"; 96 if(exec) TestEndBehaviour(); break; 97 case 8: case 9: case 10: name = "skip"; 98 break; 99 case 11: name = "TestWordBreaks"; 100 if(exec) TestWordBreaks(); break; 101 case 12: name = "TestWordBoundary"; 102 if(exec) TestWordBoundary(); break; 103 case 13: name = "TestLineBreaks"; 104 if(exec) TestLineBreaks(); break; 105 case 14: name = "TestSentBreaks"; 106 if(exec) TestSentBreaks(); break; 107 case 15: name = "TestExtended"; 108 if(exec) TestExtended(); break; 109 #else 110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 111 break; 112 #endif 113 114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 115 case 16: 116 name = "TestMonkey"; if(exec) TestMonkey(params); break; 117 #else 118 case 16: 119 name = "skip"; break; 
120 #endif 121 122 #if !UCONFIG_NO_FILE_IO 123 case 17: name = "TestBug3818"; 124 if(exec) TestBug3818(); break; 125 #else 126 case 17: name = "skip"; 127 break; 128 #endif 129 130 case 18: name = "skip"; 131 break; 132 case 19: name = "TestDebug"; 133 if(exec) TestDebug(); break; 134 case 20: name = "skip"; 135 break; 136 137 #if !UCONFIG_NO_FILE_IO 138 case 21: name = "TestBug5775"; 139 if (exec) TestBug5775(); break; 140 #else 141 case 21: name = "skip"; 142 break; 143 #endif 144 145 case 22: name = "skip"; 146 break; 147 case 23: name = "TestDictRules"; 148 if (exec) TestDictRules(); break; 149 case 24: name = "TestBug5532"; 150 if (exec) TestBug5532(); break; 151 default: name = ""; break; //needed to end loop 152 } 153 } 154 155 156 //--------------------------------------------------------------------------- 157 // 158 // class BITestData Holds a set of Break iterator test data and results 159 // Includes 160 // - the string data to be broken 161 // - a vector of the expected break positions. 162 // - a vector of source line numbers for the data, 163 // (to help see where errors occured.) 164 // - The expected break tag values. 165 // - Vectors of actual break positions and tag values. 166 // - Functions for comparing actual with expected and 167 // reporting errors. 168 // 169 //---------------------------------------------------------------------------- 170 class BITestData { 171 public: 172 UnicodeString fDataToBreak; 173 UVector fExpectedBreakPositions; 174 UVector fExpectedTags; 175 UVector fLineNum; 176 UVector fActualBreakPositions; // Test Results. 177 UVector fActualTags; 178 179 BITestData(UErrorCode &status); 180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 181 void checkResults(const char *heading, RBBITest *test); 182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 183 void clearResults(); 184 }; 185 186 // 187 // Constructor. 
188 // 189 BITestData::BITestData(UErrorCode &status) 190 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 191 fActualTags(status) 192 { 193 } 194 195 // 196 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 197 // The macro form collects the line number, which is helpful 198 // when tracking down failures. 199 // 200 // A null data item is inserted at the start of each test's data 201 // to put the starting zero into the data list. The position saved for 202 // each non-null item is its ending position. 203 // 204 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 206 if (U_FAILURE(status)) {return;} 207 if (data != NULL) { 208 fDataToBreak.append(CharsToUnicodeString(data)); 209 } 210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 211 fExpectedTags.addElement(tag, status); 212 fLineNum.addElement(lineNum, status); 213 } 214 215 216 // 217 // checkResults. Compare the actual and expected break positions, report any differences. 218 // 219 void BITestData::checkResults(const char *heading, RBBITest *test) { 220 int32_t expectedIndex = 0; 221 int32_t actualIndex = 0; 222 223 for (;;) { 224 // If we've run through both the expected and actual results vectors, we're done. 225 // break out of the loop. 
226 if (expectedIndex >= fExpectedBreakPositions.size() && 227 actualIndex >= fActualBreakPositions.size()) { 228 break; 229 } 230 231 232 if (expectedIndex >= fExpectedBreakPositions.size()) { 233 err(heading, test, expectedIndex-1, actualIndex); 234 actualIndex++; 235 continue; 236 } 237 238 if (actualIndex >= fActualBreakPositions.size()) { 239 err(heading, test, expectedIndex, actualIndex-1); 240 expectedIndex++; 241 continue; 242 } 243 244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 245 err(heading, test, expectedIndex, actualIndex); 246 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 248 actualIndex++; 249 } else { 250 expectedIndex++; 251 } 252 continue; 253 } 254 255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 257 heading, fLineNum.elementAt(expectedIndex), 258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 259 } 260 261 actualIndex++; 262 expectedIndex++; 263 } 264 } 265 266 // 267 // err - An error was found. Report it, along with information about where the 268 // incorrectly broken test data appeared in the source file. 269 // 270 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 271 { 272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 273 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 274 int32_t o = 0; 275 int32_t line = fLineNum.elementAti(expectedIdx); 276 if (expectedIdx > 0) { 277 // The line numbers are off by one because a premature break occurs somewhere 278 // within the previous item, rather than at the start of the current (expected) item. 
279 // We want to report the offset of the unexpected break from the start of 280 // this previous item. 281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 282 } 283 if (actual < expected) { 284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 285 } else { 286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 287 } 288 } 289 290 291 void BITestData::clearResults() { 292 fActualBreakPositions.removeAllElements(); 293 fActualTags.removeAllElements(); 294 } 295 296 297 //-------------------------------------------------------------------------------------- 298 // 299 // RBBITest constructor and destructor 300 // 301 //-------------------------------------------------------------------------------------- 302 303 RBBITest::RBBITest() { 304 } 305 306 307 RBBITest::~RBBITest() { 308 } 309 310 //----------------------------------------------------------------------------------- 311 // 312 // Test for status {tag} return value from break rules. 313 // TODO: a more thorough test. 
314 // 315 //----------------------------------------------------------------------------------- 316 void RBBITest::TestStatusReturn() { 317 UnicodeString rulesString1("$Letters = [:L:];\n" 318 "$Numbers = [:N:];\n" 319 "$Letters+{1};\n" 320 "$Numbers+{2};\n" 321 "Help\\ {4}/me\\!;\n" 322 "[^$Letters $Numbers];\n" 323 "!.*;\n", -1, US_INV); 324 UnicodeString testString1 = "abc123..abc Help me Help me!"; 325 // 01234567890123456789012345678 326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 328 329 UErrorCode status=U_ZERO_ERROR; 330 UParseError parseError; 331 332 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 333 if(U_FAILURE(status)) { 334 dataerrln("FAIL : in construction - %s", u_errorName(status)); 335 } else { 336 int32_t pos; 337 int32_t i = 0; 338 bi->setText(testString1); 339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 340 if (pos != bounds1[i]) { 341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 342 break; 343 } 344 345 int tag = bi->getRuleStatus(); 346 if (tag != brkStatus[i]) { 347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 348 break; 349 } 350 i++; 351 } 352 } 353 delete bi; 354 } 355 356 357 static void printStringBreaks(UnicodeString ustr, int expected[], 358 int expectedcount) 359 { 360 UErrorCode status = U_ZERO_ERROR; 361 char name[100]; 362 printf("code alpha extend alphanum type word sent line name\n"); 363 int j; 364 for (j = 0; j < ustr.length(); j ++) { 365 if (expectedcount > 0) { 366 int k; 367 for (k = 0; k < expectedcount; k ++) { 368 if (j == expected[k]) { 369 printf("------------------------------------------------ %d\n", 370 j); 371 } 372 } 373 } 374 UChar32 c = ustr.char32At(j); 375 if (c > 0xffff) { 376 j ++; 377 } 378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 379 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 380 u_isUAlphabetic(c), 381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 382 u_isalnum(c), 383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 384 u_charType(c), 385 U_SHORT_PROPERTY_NAME), 386 u_getPropertyValueName(UCHAR_WORD_BREAK, 387 u_getIntPropertyValue(c, 388 UCHAR_WORD_BREAK), 389 U_SHORT_PROPERTY_NAME), 390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 391 u_getIntPropertyValue(c, 392 UCHAR_SENTENCE_BREAK), 393 U_SHORT_PROPERTY_NAME), 394 u_getPropertyValueName(UCHAR_LINE_BREAK, 395 u_getIntPropertyValue(c, 396 UCHAR_LINE_BREAK), 397 U_SHORT_PROPERTY_NAME), 398 name); 399 } 400 } 401 402 403 void RBBITest::TestBug3818() { 404 UErrorCode status = U_ZERO_ERROR; 405 406 // Four Thai words... 407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 409 UnicodeString thaiStr(thaiWordData); 410 411 RuleBasedBreakIterator* bi = 412 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 413 if (U_FAILURE(status) || bi == NULL) { 414 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 415 return; 416 } 417 bi->setText(thaiStr); 418 419 int32_t startOfSecondWord = bi->following(1); 420 if (startOfSecondWord != 4) { 421 errln("Fail at file %s, line %d expected start of word at 4, got %d", 422 __FILE__, __LINE__, startOfSecondWord); 423 } 424 startOfSecondWord = bi->following(0); 425 if (startOfSecondWord != 4) { 426 errln("Fail at file %s, line %d expected start of word at 4, got %d", 427 __FILE__, __LINE__, startOfSecondWord); 428 } 429 delete bi; 430 } 431 432 //---------------------------------------------------------------------------- 433 // 434 // generalIteratorTest Given a break iterator and a set of test data, 435 // Run the tests and report the results. 
436 // 437 //---------------------------------------------------------------------------- 438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 439 { 440 441 bi.setText(td.fDataToBreak); 442 443 testFirstAndNext(bi, td); 444 445 testLastAndPrevious(bi, td); 446 447 testFollowing(bi, td); 448 testPreceding(bi, td); 449 testIsBoundary(bi, td); 450 doMultipleSelectionTest(bi, td); 451 } 452 453 454 // 455 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 456 // kind of loop. 457 // 458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 459 { 460 UErrorCode status = U_ZERO_ERROR; 461 int32_t p; 462 int32_t lastP = -1; 463 int32_t tag; 464 465 logln("Test first and next"); 466 bi.setText(td.fDataToBreak); 467 td.clearResults(); 468 469 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 470 td.fActualBreakPositions.addElement(p, status); // Save result. 471 tag = bi.getRuleStatus(); 472 td.fActualTags.addElement(tag, status); 473 if (p <= lastP) { 474 // If the iterator is not making forward progress, stop. 475 // No need to raise an error here, it'll be detected in the normal check of results. 476 break; 477 } 478 lastP = p; 479 } 480 td.checkResults("testFirstAndNext", this); 481 } 482 483 484 // 485 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 486 // 487 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 488 { 489 UErrorCode status = U_ZERO_ERROR; 490 int32_t p; 491 int32_t lastP = 0x7ffffffe; 492 int32_t tag; 493 494 logln("Test last and previous"); 495 bi.setText(td.fDataToBreak); 496 td.clearResults(); 497 498 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 499 // Save break position. Insert it at start of vector of results, shoving 500 // already-saved results further towards the end. 
501 td.fActualBreakPositions.insertElementAt(p, 0, status); 502 // bi.previous(); // TODO: Why does this fix things up???? 503 // bi.next(); 504 tag = bi.getRuleStatus(); 505 td.fActualTags.insertElementAt(tag, 0, status); 506 if (p >= lastP) { 507 // If the iterator is not making progress, stop. 508 // No need to raise an error here, it'll be detected in the normal check of results. 509 break; 510 } 511 lastP = p; 512 } 513 td.checkResults("testLastAndPrevious", this); 514 } 515 516 517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 518 { 519 UErrorCode status = U_ZERO_ERROR; 520 int32_t p; 521 int32_t tag; 522 int32_t lastP = -2; // A value that will never be returned as a break position. 523 // cannot be -1; that is returned for DONE. 524 int i; 525 526 logln("testFollowing():"); 527 bi.setText(td.fDataToBreak); 528 td.clearResults(); 529 530 // Save the starting point, since we won't get that out of following. 531 p = bi.first(); 532 td.fActualBreakPositions.addElement(p, status); // Save result. 533 tag = bi.getRuleStatus(); 534 td.fActualTags.addElement(tag, status); 535 536 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 537 p = bi.following(i); 538 if (p != lastP) { 539 if (p == RuleBasedBreakIterator::DONE) { 540 break; 541 } 542 // We've reached a new break position. Save it. 543 td.fActualBreakPositions.addElement(p, status); // Save result. 544 tag = bi.getRuleStatus(); 545 td.fActualTags.addElement(tag, status); 546 lastP = p; 547 } 548 } 549 // The loop normally exits by means of the break in the middle. 550 // Make sure that the index was at the correct position for the break iterator to have 551 // returned DONE. 552 if (i != td.fDataToBreak.length()) { 553 errln("testFollowing(): iterator returned DONE prematurely."); 554 } 555 556 // Full check of all results. 
557 td.checkResults("testFollowing", this); 558 } 559 560 561 562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 563 UErrorCode status = U_ZERO_ERROR; 564 int32_t p; 565 int32_t tag; 566 int32_t lastP = 0x7ffffffe; 567 int i; 568 569 logln("testPreceding():"); 570 bi.setText(td.fDataToBreak); 571 td.clearResults(); 572 573 p = bi.last(); 574 td.fActualBreakPositions.addElement(p, status); 575 tag = bi.getRuleStatus(); 576 td.fActualTags.addElement(tag, status); 577 578 for (i = td.fDataToBreak.length(); i>=-1; i--) { 579 p = bi.preceding(i); 580 if (p != lastP) { 581 if (p == RuleBasedBreakIterator::DONE) { 582 break; 583 } 584 // We've reached a new break position. Save it. 585 td.fActualBreakPositions.insertElementAt(p, 0, status); 586 lastP = p; 587 tag = bi.getRuleStatus(); 588 td.fActualTags.insertElementAt(tag, 0, status); 589 } 590 } 591 // The loop normally exits by means of the break in the middle. 592 // Make sure that the index was at the correct position for the break iterator to have 593 // returned DONE. 594 if (i != 0) { 595 errln("testPreceding(): iterator returned DONE prematurely."); 596 } 597 598 // Full check of all results. 599 td.checkResults("testPreceding", this); 600 } 601 602 603 604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 605 UErrorCode status = U_ZERO_ERROR; 606 int i; 607 int32_t tag; 608 609 logln("testIsBoundary():"); 610 bi.setText(td.fDataToBreak); 611 td.clearResults(); 612 613 for (i = 0; i <= td.fDataToBreak.length(); i++) { 614 if (bi.isBoundary(i)) { 615 td.fActualBreakPositions.addElement(i, status); // Save result. 
616 tag = bi.getRuleStatus(); 617 td.fActualTags.addElement(tag, status); 618 } 619 } 620 td.checkResults("testIsBoundary: ", this); 621 } 622 623 624 625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 626 { 627 iterator.setText(td.fDataToBreak); 628 629 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 630 int32_t offset = iterator.first(); 631 int32_t testOffset; 632 int32_t count = 0; 633 634 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 635 636 if (*testIterator != iterator) 637 errln("clone() or operator!= failed: two clones compared unequal"); 638 639 do { 640 testOffset = testIterator->first(); 641 testOffset = testIterator->next(count); 642 if (offset != testOffset) 643 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 644 645 if (offset != RuleBasedBreakIterator::DONE) { 646 count++; 647 offset = iterator.next(); 648 649 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 650 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 651 if (count > 10000 || offset == -1) { 652 errln("operator== failed too many times. Stopping test."); 653 if (offset == -1) { 654 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 655 } 656 return; 657 } 658 } 659 } 660 } while (offset != RuleBasedBreakIterator::DONE); 661 662 // now do it backwards... 
663 offset = iterator.last(); 664 count = 0; 665 666 do { 667 testOffset = testIterator->last(); 668 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 669 if (offset != testOffset) 670 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 671 672 if (offset != RuleBasedBreakIterator::DONE) { 673 count--; 674 offset = iterator.previous(); 675 } 676 } while (offset != RuleBasedBreakIterator::DONE); 677 678 delete testIterator; 679 } 680 681 682 //--------------------------------------------- 683 // 684 // other tests 685 // 686 //--------------------------------------------- 687 void RBBITest::TestEmptyString() 688 { 689 UnicodeString text = ""; 690 UErrorCode status = U_ZERO_ERROR; 691 692 BITestData x(status); 693 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 694 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 695 if (U_FAILURE(status)) 696 { 697 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 698 return; 699 } 700 generalIteratorTest(*bi, x); 701 delete bi; 702 } 703 704 void RBBITest::TestGetAvailableLocales() 705 { 706 int32_t locCount = 0; 707 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 708 709 if (locCount == 0) 710 dataerrln("getAvailableLocales() returned an empty list!"); 711 // Just make sure that it's returning good memory. 
712 int32_t i; 713 for (i = 0; i < locCount; ++i) { 714 logln(locList[i].getName()); 715 } 716 } 717 718 //Testing the BreakIterator::getDisplayName() function 719 void RBBITest::TestGetDisplayName() 720 { 721 UnicodeString result; 722 723 BreakIterator::getDisplayName(Locale::getUS(), result); 724 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 725 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 726 + result); 727 728 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 729 if (result != "French (France)") 730 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 731 + result); 732 } 733 /** 734 * Test End Behaviour 735 * @bug 4068137 736 */ 737 void RBBITest::TestEndBehaviour() 738 { 739 UErrorCode status = U_ZERO_ERROR; 740 UnicodeString testString("boo."); 741 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 742 if (U_FAILURE(status)) 743 { 744 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 745 return; 746 } 747 wb->setText(testString); 748 749 if (wb->first() != 0) 750 errln("Didn't get break at beginning of string."); 751 if (wb->next() != 3) 752 errln("Didn't get break before period in \"boo.\""); 753 if (wb->current() != 4 && wb->next() != 4) 754 errln("Didn't get break at end of string."); 755 delete wb; 756 } 757 /* 758 * @bug 4153072 759 */ 760 void RBBITest::TestBug4153072() { 761 UErrorCode status = U_ZERO_ERROR; 762 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 763 if (U_FAILURE(status)) 764 { 765 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 766 return; 767 } 768 UnicodeString str("...Hello, World!..."); 769 int32_t begin = 3; 770 int32_t end = str.length() - 3; 771 UBool onBoundary; 772 773 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 774 iter->adoptText(textIterator); 775 int index; 776 // Note: with the switch to UText, there is no way to restrict the 777 // iteration range to begin at an index other than zero. 778 // String character iterators created with a non-zero bound are 779 // treated by RBBI as being empty. 780 for (index = -1; index < begin + 1; ++index) { 781 onBoundary = iter->isBoundary(index); 782 if (index == 0? !onBoundary : onBoundary) { 783 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 784 " and begin index = " + begin); 785 } 786 } 787 delete iter; 788 } 789 790 791 // 792 // Test for problem reported by Ashok Matoria on 9 July 2007 793 // One.<kSoftHyphen><kSpace>Two. 794 // 795 // Sentence break at start (0) and then on calling next() it breaks at 796 // 'T' of "Two". Now, at this point if I do next() and 797 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
798 // 799 void RBBITest::TestBug5775() { 800 UErrorCode status = U_ZERO_ERROR; 801 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 802 TEST_ASSERT_SUCCESS(status); 803 if (U_FAILURE(status)) { 804 return; 805 } 806 // Check for status first for better handling of no data errors. 807 TEST_ASSERT(bi != NULL); 808 if (bi == NULL) { 809 return; 810 } 811 812 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 813 // 01234 56789 814 s = s.unescape(); 815 bi->setText(s); 816 int pos = bi->next(); 817 TEST_ASSERT(pos == 6); 818 pos = bi->next(); 819 TEST_ASSERT(pos == 10); 820 pos = bi->previous(); 821 TEST_ASSERT(pos == 6); 822 delete bi; 823 } 824 825 826 827 //------------------------------------------------------------------------------ 828 // 829 // RBBITest::Extended Run RBBI Tests from an external test data file 830 // 831 //------------------------------------------------------------------------------ 832 833 struct TestParams { 834 BreakIterator *bi; 835 UnicodeString dataToBreak; 836 UVector32 *expectedBreaks; 837 UVector32 *srcLine; 838 UVector32 *srcCol; 839 }; 840 841 void RBBITest::executeTest(TestParams *t) { 842 int32_t bp; 843 int32_t prevBP; 844 int32_t i; 845 846 if (t->bi == NULL) { 847 return; 848 } 849 850 t->bi->setText(t->dataToBreak); 851 // 852 // Run the iterator forward 853 // 854 prevBP = -1; 855 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 856 if (prevBP == bp) { 857 // Fail for lack of forward progress. 858 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 859 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 860 break; 861 } 862 863 // Check that there were we didn't miss an expected break between the last one 864 // and this one. 
865 for (i=prevBP+1; i<bp; i++) { 866 if (t->expectedBreaks->elementAti(i) != 0) { 867 int expected[] = {0, i}; 868 printStringBreaks(t->dataToBreak, expected, 2); 869 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 870 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 871 } 872 } 873 874 // Check that the break we did find was expected 875 if (t->expectedBreaks->elementAti(bp) == 0) { 876 int expected[] = {0, bp}; 877 printStringBreaks(t->dataToBreak, expected, 2); 878 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 879 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 880 } else { 881 // The break was expected. 882 // Check that the {nnn} tag value is correct. 883 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 884 if (expectedTagVal == -1) { 885 expectedTagVal = 0; 886 } 887 int32_t line = t->srcLine->elementAti(bp); 888 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 889 if (rs != expectedTagVal) { 890 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 891 " Actual, Expected status = %4d, %4d", 892 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 893 } 894 } 895 896 897 prevBP = bp; 898 } 899 900 // Verify that there were no missed expected breaks after the last one found 901 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 902 if (t->expectedBreaks->elementAti(i) != 0) { 903 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 904 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 905 } 906 } 907 908 // 909 // Run the iterator backwards, verify that the same breaks are found. 910 // 911 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 912 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 913 if (prevBP == bp) { 914 // Fail for lack of progress. 
915 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 916 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 917 break; 918 } 919 920 // Check that there were we didn't miss an expected break between the last one 921 // and this one. (UVector returns zeros for index out of bounds.) 922 for (i=prevBP-1; i>bp; i--) { 923 if (t->expectedBreaks->elementAti(i) != 0) { 924 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 925 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 926 } 927 } 928 929 // Check that the break we did find was expected 930 if (t->expectedBreaks->elementAti(bp) == 0) { 931 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 932 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 933 } else { 934 // The break was expected. 935 // Check that the {nnn} tag value is correct. 936 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 937 if (expectedTagVal == -1) { 938 expectedTagVal = 0; 939 } 940 int line = t->srcLine->elementAti(bp); 941 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 942 if (rs != expectedTagVal) { 943 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 944 " Actual, Expected status = %4d, %4d", 945 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 946 } 947 } 948 949 prevBP = bp; 950 } 951 952 // Verify that there were no missed breaks prior to the last one found 953 for (i=prevBP-1; i>=0; i--) { 954 if (t->expectedBreaks->elementAti(i) != 0) { 955 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 956 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 957 } 958 } 959 } 960 961 962 void RBBITest::TestExtended() { 963 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 964 UErrorCode status = U_ZERO_ERROR; 965 Locale locale(""); 966 967 UnicodeString rules; 968 TestParams tp; 969 tp.bi = NULL; 970 tp.expectedBreaks = new UVector32(status); 971 tp.srcLine = new UVector32(status); 972 tp.srcCol = new UVector32(status); 973 974 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 975 if (U_FAILURE(status)) { 976 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 977 } 978 979 980 // 981 // Open and read the test data file. 982 // 983 const char *testDataDirectory = IntlTest::getSourceTestData(status); 984 char testFileName[1000]; 985 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 986 errln("Can't open test data. Path too long."); 987 return; 988 } 989 strcpy(testFileName, testDataDirectory); 990 strcat(testFileName, "rbbitst.txt"); 991 992 int len; 993 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 994 if (U_FAILURE(status)) { 995 return; /* something went wrong, error already output */ 996 } 997 998 999 1000 1001 // 1002 // Put the test data into a UnicodeString 1003 // 1004 UnicodeString testString(FALSE, testFile, len); 1005 1006 enum EParseState{ 1007 PARSE_COMMENT, 1008 PARSE_TAG, 1009 PARSE_DATA, 1010 PARSE_NUM 1011 } 1012 parseState = PARSE_TAG; 1013 1014 EParseState savedState = PARSE_TAG; 1015 1016 static const UChar CH_LF = 0x0a; 1017 static const UChar CH_CR = 0x0d; 1018 static const UChar CH_HASH = 0x23; 1019 /*static const UChar CH_PERIOD = 0x2e;*/ 1020 static const UChar CH_LT = 0x3c; 1021 static const UChar CH_GT = 0x3e; 1022 static const UChar CH_BACKSLASH = 0x5c; 1023 static const UChar CH_BULLET = 0x2022; 1024 1025 int32_t lineNum = 1; 1026 int32_t colStart = 0; 1027 
int32_t column = 0; 1028 int32_t charIdx = 0; 1029 1030 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 1031 1032 for (charIdx = 0; charIdx < len; ) { 1033 status = U_ZERO_ERROR; 1034 UChar c = testString.charAt(charIdx); 1035 charIdx++; 1036 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1037 // treat CRLF as a unit 1038 c = CH_LF; 1039 charIdx++; 1040 } 1041 if (c == CH_LF || c == CH_CR) { 1042 lineNum++; 1043 colStart = charIdx; 1044 } 1045 column = charIdx - colStart + 1; 1046 1047 switch (parseState) { 1048 case PARSE_COMMENT: 1049 if (c == 0x0a || c == 0x0d) { 1050 parseState = savedState; 1051 } 1052 break; 1053 1054 case PARSE_TAG: 1055 { 1056 if (c == CH_HASH) { 1057 parseState = PARSE_COMMENT; 1058 savedState = PARSE_TAG; 1059 break; 1060 } 1061 if (u_isUWhiteSpace(c)) { 1062 break; 1063 } 1064 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1065 delete tp.bi; 1066 tp.bi = BreakIterator::createWordInstance(locale, status); 1067 charIdx += 5; 1068 break; 1069 } 1070 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1071 delete tp.bi; 1072 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1073 charIdx += 5; 1074 break; 1075 } 1076 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1077 delete tp.bi; 1078 tp.bi = BreakIterator::createLineInstance(locale, status); 1079 charIdx += 5; 1080 break; 1081 } 1082 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1083 delete tp.bi; 1084 tp.bi = NULL; 1085 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1086 charIdx += 5; 1087 break; 1088 } 1089 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1090 delete tp.bi; 1091 tp.bi = BreakIterator::createTitleInstance(locale, status); 1092 charIdx += 6; 1093 break; 1094 } 1095 1096 // <locale loc_name> 1097 localeMatcher.reset(testString); 1098 if (localeMatcher.lookingAt(charIdx-1, status)) { 1099 UnicodeString localeName = localeMatcher.group(1, status); 1100 char localeName8[100]; 
1101 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1102 locale = Locale::createFromName(localeName8); 1103 charIdx += localeMatcher.group(0, status).length(); 1104 TEST_ASSERT_SUCCESS(status); 1105 break; 1106 } 1107 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1108 parseState = PARSE_DATA; 1109 charIdx += 5; 1110 tp.dataToBreak = ""; 1111 tp.expectedBreaks->removeAllElements(); 1112 tp.srcCol ->removeAllElements(); 1113 tp.srcLine->removeAllElements(); 1114 break; 1115 } 1116 1117 errln("line %d: Tag expected in test file.", lineNum); 1118 parseState = PARSE_COMMENT; 1119 savedState = PARSE_DATA; 1120 goto end_test; // Stop the test. 1121 } 1122 break; 1123 1124 case PARSE_DATA: 1125 if (c == CH_BULLET) { 1126 int32_t breakIdx = tp.dataToBreak.length(); 1127 tp.expectedBreaks->setSize(breakIdx+1); 1128 tp.expectedBreaks->setElementAt(-1, breakIdx); 1129 tp.srcLine->setSize(breakIdx+1); 1130 tp.srcLine->setElementAt(lineNum, breakIdx); 1131 tp.srcCol ->setSize(breakIdx+1); 1132 tp.srcCol ->setElementAt(column, breakIdx); 1133 break; 1134 } 1135 1136 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1137 // Add final entry to mappings from break location to source file position. 1138 // Need one extra because last break position returned is after the 1139 // last char in the data, not at the last char. 1140 tp.srcLine->addElement(lineNum, status); 1141 tp.srcCol ->addElement(column, status); 1142 1143 parseState = PARSE_TAG; 1144 charIdx += 6; 1145 1146 // RUN THE TEST! 1147 executeTest(&tp); 1148 break; 1149 } 1150 1151 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1152 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1153 // Get the code point from the name and insert it into the test data. 1154 // (Damn, no API takes names in Unicode !!! 
1155 // we've got to take it back to char *) 1156 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1157 int32_t nameLength = nameEndIdx - (charIdx+2); 1158 char charNameBuf[200]; 1159 UChar32 theChar = -1; 1160 if (nameEndIdx != -1) { 1161 UErrorCode status = U_ZERO_ERROR; 1162 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1163 charNameBuf[sizeof(charNameBuf)-1] = 0; 1164 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1165 if (U_FAILURE(status)) { 1166 theChar = -1; 1167 } 1168 } 1169 if (theChar == -1) { 1170 errln("Error in named character in test file at line %d, col %d", 1171 lineNum, column); 1172 } else { 1173 // Named code point was recognized. Insert it 1174 // into the test data. 1175 tp.dataToBreak.append(theChar); 1176 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1177 tp.srcLine->addElement(lineNum, status); 1178 tp.srcCol ->addElement(column, status); 1179 } 1180 } 1181 if (nameEndIdx > charIdx) { 1182 charIdx = nameEndIdx+1; 1183 1184 } 1185 break; 1186 } 1187 1188 1189 1190 1191 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1192 charIdx++; 1193 int32_t breakIdx = tp.dataToBreak.length(); 1194 tp.expectedBreaks->setSize(breakIdx+1); 1195 tp.expectedBreaks->setElementAt(-1, breakIdx); 1196 tp.srcLine->setSize(breakIdx+1); 1197 tp.srcLine->setElementAt(lineNum, breakIdx); 1198 tp.srcCol ->setSize(breakIdx+1); 1199 tp.srcCol ->setElementAt(column, breakIdx); 1200 break; 1201 } 1202 1203 if (c == CH_LT) { 1204 tagValue = 0; 1205 parseState = PARSE_NUM; 1206 break; 1207 } 1208 1209 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1210 parseState = PARSE_COMMENT; 1211 savedState = PARSE_DATA; 1212 break; 1213 } 1214 1215 if (c == CH_BACKSLASH) { 1216 // Check for \ at end of line, a line continuation. 
1217 // Advance over (discard) the newline 1218 UChar32 cp = testString.char32At(charIdx); 1219 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1220 // We have a CR LF 1221 // Need an extra increment of the input ptr to move over both of them 1222 charIdx++; 1223 } 1224 if (cp == CH_LF || cp == CH_CR) { 1225 lineNum++; 1226 colStart = charIdx; 1227 charIdx++; 1228 break; 1229 } 1230 1231 // Let unescape handle the back slash. 1232 cp = testString.unescapeAt(charIdx); 1233 if (cp != -1) { 1234 // Escape sequence was recognized. Insert the char 1235 // into the test data. 1236 tp.dataToBreak.append(cp); 1237 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1238 tp.srcLine->addElement(lineNum, status); 1239 tp.srcCol ->addElement(column, status); 1240 } 1241 break; 1242 } 1243 1244 1245 // Not a recognized backslash escape sequence. 1246 // Take the next char as a literal. 1247 // TODO: Should this be an error? 1248 c = testString.charAt(charIdx); 1249 charIdx = testString.moveIndex32(charIdx, 1); 1250 } 1251 1252 // Normal, non-escaped data char. 1253 tp.dataToBreak.append(c); 1254 1255 // Save the mapping from offset in the data to line/column numbers in 1256 // the original input file. Will be used for better error messages only. 1257 // If there's an expected break before this char, the slot in the mapping 1258 // vector will already be set for this char; don't overwrite it. 1259 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1260 tp.srcLine->addElement(lineNum, status); 1261 tp.srcCol ->addElement(column, status); 1262 } 1263 break; 1264 1265 1266 case PARSE_NUM: 1267 // We are parsing an expected numeric tag value, like <1234>, 1268 // within a chunk of data. 1269 if (u_isUWhiteSpace(c)) { 1270 break; 1271 } 1272 1273 if (c == CH_GT) { 1274 // Finished the number. Add the info to the expected break data, 1275 // and switch parse state back to doing plain data. 
1276 parseState = PARSE_DATA; 1277 if (tagValue == 0) { 1278 tagValue = -1; 1279 } 1280 int32_t breakIdx = tp.dataToBreak.length(); 1281 tp.expectedBreaks->setSize(breakIdx+1); 1282 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1283 tp.srcLine->setSize(breakIdx+1); 1284 tp.srcLine->setElementAt(lineNum, breakIdx); 1285 tp.srcCol ->setSize(breakIdx+1); 1286 tp.srcCol ->setElementAt(column, breakIdx); 1287 break; 1288 } 1289 1290 if (u_isdigit(c)) { 1291 tagValue = tagValue*10 + u_charDigitValue(c); 1292 break; 1293 } 1294 1295 errln("Syntax Error in test file at line %d, col %d", 1296 lineNum, column); 1297 parseState = PARSE_COMMENT; 1298 goto end_test; // Stop the test 1299 break; 1300 } 1301 1302 1303 if (U_FAILURE(status)) { 1304 dataerrln("ICU Error %s while parsing test file at line %d.", 1305 u_errorName(status), lineNum); 1306 status = U_ZERO_ERROR; 1307 goto end_test; // Stop the test 1308 } 1309 1310 } 1311 1312 end_test: 1313 delete tp.bi; 1314 delete tp.expectedBreaks; 1315 delete tp.srcLine; 1316 delete tp.srcCol; 1317 delete [] testFile; 1318 #endif 1319 } 1320 1321 1322 //------------------------------------------------------------------------------- 1323 // 1324 // TestDictRules create a break iterator from source rules that includes a 1325 // dictionary range. Regression for bug #7130. Source rules 1326 // do not declare a break iterator type (word, line, sentence, etc. 1327 // but the dictionary code, without a type, would loop. 
1328 // 1329 //------------------------------------------------------------------------------- 1330 void RBBITest::TestDictRules() { 1331 const char *rules = "$dictionary = [a-z]; \n" 1332 "!!forward; \n" 1333 "$dictionary $dictionary; \n" 1334 "!!reverse; \n" 1335 "$dictionary $dictionary; \n"; 1336 const char *text = "aa"; 1337 UErrorCode status = U_ZERO_ERROR; 1338 UParseError parseError; 1339 1340 RuleBasedBreakIterator bi(rules, parseError, status); 1341 if (U_SUCCESS(status)) { 1342 UnicodeString utext = text; 1343 bi.setText(utext); 1344 int32_t position; 1345 int32_t loops; 1346 for (loops = 0; loops<10; loops++) { 1347 position = bi.next(); 1348 if (position == RuleBasedBreakIterator::DONE) { 1349 break; 1350 } 1351 } 1352 TEST_ASSERT(loops == 1); 1353 } else { 1354 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1355 } 1356 } 1357 1358 1359 1360 //------------------------------------------------------------------------------- 1361 // 1362 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1363 // return the datain one big UChar * buffer, which the caller must delete. 1364 // 1365 // parameters: 1366 // fileName: the name of the file, with no directory part. The test data directory 1367 // is assumed. 1368 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1369 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1370 // specified here. The BOM, if it exists, will be stripped from the returned data. 1371 // Pass NULL for the system default encoding. 1372 // status 1373 // returns: 1374 // The file data, converted to UChar. 1375 // The caller must delete this when done with 1376 // delete [] theBuffer; 1377 // 1378 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1379 // Move this function to some common place. 
1380 // 1381 //-------------------------------------------------------------------------------- 1382 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1383 UChar *retPtr = NULL; 1384 char *fileBuf = NULL; 1385 UConverter* conv = NULL; 1386 FILE *f = NULL; 1387 1388 ulen = 0; 1389 if (U_FAILURE(status)) { 1390 return retPtr; 1391 } 1392 1393 // 1394 // Open the file. 1395 // 1396 f = fopen(fileName, "rb"); 1397 if (f == 0) { 1398 dataerrln("Error opening test data file %s\n", fileName); 1399 status = U_FILE_ACCESS_ERROR; 1400 return NULL; 1401 } 1402 // 1403 // Read it in 1404 // 1405 int fileSize; 1406 int amt_read; 1407 1408 fseek( f, 0, SEEK_END); 1409 fileSize = ftell(f); 1410 fileBuf = new char[fileSize]; 1411 fseek(f, 0, SEEK_SET); 1412 amt_read = fread(fileBuf, 1, fileSize, f); 1413 if (amt_read != fileSize || fileSize <= 0) { 1414 errln("Error reading test data file."); 1415 goto cleanUpAndReturn; 1416 } 1417 1418 // 1419 // Look for a Unicode Signature (BOM) on the data just read 1420 // 1421 int32_t signatureLength; 1422 const char * fileBufC; 1423 const char* bomEncoding; 1424 1425 fileBufC = fileBuf; 1426 bomEncoding = ucnv_detectUnicodeSignature( 1427 fileBuf, fileSize, &signatureLength, &status); 1428 if(bomEncoding!=NULL ){ 1429 fileBufC += signatureLength; 1430 fileSize -= signatureLength; 1431 encoding = bomEncoding; 1432 } 1433 1434 // 1435 // Open a converter to take the rule file to UTF-16 1436 // 1437 conv = ucnv_open(encoding, &status); 1438 if (U_FAILURE(status)) { 1439 goto cleanUpAndReturn; 1440 } 1441 1442 // 1443 // Convert the rules to UChar. 1444 // Preflight first to determine required buffer size. 1445 // 1446 ulen = ucnv_toUChars(conv, 1447 NULL, // dest, 1448 0, // destCapacity, 1449 fileBufC, 1450 fileSize, 1451 &status); 1452 if (status == U_BUFFER_OVERFLOW_ERROR) { 1453 // Buffer Overflow is expected from the preflight operation. 
1454 status = U_ZERO_ERROR; 1455 1456 retPtr = new UChar[ulen+1]; 1457 ucnv_toUChars(conv, 1458 retPtr, // dest, 1459 ulen+1, 1460 fileBufC, 1461 fileSize, 1462 &status); 1463 } 1464 1465 cleanUpAndReturn: 1466 fclose(f); 1467 delete []fileBuf; 1468 ucnv_close(conv); 1469 if (U_FAILURE(status)) { 1470 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1471 delete []retPtr; 1472 retPtr = 0; 1473 ulen = 0; 1474 }; 1475 return retPtr; 1476 } 1477 1478 1479 1480 //-------------------------------------------------------------------------------------------- 1481 // 1482 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1483 // 1484 //------------------------------------------------------------------------------------------- 1485 void RBBITest::TestUnicodeFiles() { 1486 RuleBasedBreakIterator *bi; 1487 UErrorCode status = U_ZERO_ERROR; 1488 1489 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1490 TEST_ASSERT_SUCCESS(status); 1491 if (U_SUCCESS(status)) { 1492 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1493 } 1494 delete bi; 1495 1496 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1497 TEST_ASSERT_SUCCESS(status); 1498 if (U_SUCCESS(status)) { 1499 runUnicodeTestData("WordBreakTest.txt", bi); 1500 } 1501 delete bi; 1502 1503 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1504 TEST_ASSERT_SUCCESS(status); 1505 if (U_SUCCESS(status)) { 1506 runUnicodeTestData("SentenceBreakTest.txt", bi); 1507 } 1508 delete bi; 1509 1510 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1511 TEST_ASSERT_SUCCESS(status); 1512 if (U_SUCCESS(status)) { 1513 runUnicodeTestData("LineBreakTest.txt", bi); 1514 } 1515 delete bi; 1516 } 1517 1518 1519 
//-------------------------------------------------------------------------------------------- 1520 // 1521 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1522 // 1523 //------------------------------------------------------------------------------------------- 1524 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1525 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1526 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270 1527 UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1); 1528 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 1529 UErrorCode status = U_ZERO_ERROR; 1530 1531 // 1532 // Open and read the test data file, put it into a UnicodeString. 1533 // 1534 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1535 char testFileName[1000]; 1536 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1537 dataerrln("Can't open test data. Path too long."); 1538 return; 1539 } 1540 strcpy(testFileName, testDataDirectory); 1541 strcat(testFileName, fileName); 1542 1543 logln("Opening data file %s\n", fileName); 1544 1545 int len; 1546 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1547 if (status != U_FILE_ACCESS_ERROR) { 1548 TEST_ASSERT_SUCCESS(status); 1549 TEST_ASSERT(testFile != NULL); 1550 } 1551 if (U_FAILURE(status) || testFile == NULL) { 1552 return; /* something went wrong, error already output */ 1553 } 1554 UnicodeString testFileAsString(TRUE, testFile, len); 1555 1556 // 1557 // Parse the test data file using a regular expression. 1558 // Each kind of token is recognized in its own capture group; what type of item was scanned 1559 // is identified by which group had a match. 
1560 // 1561 // Caputure Group # 1 2 3 4 5 1562 // Parses this item: divide x hex digits comment \n unrecognized \n 1563 // 1564 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1565 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1566 UnicodeString testString; 1567 UVector32 breakPositions(status); 1568 int lineNumber = 1; 1569 TEST_ASSERT_SUCCESS(status); 1570 if (U_FAILURE(status)) { 1571 return; 1572 } 1573 1574 // 1575 // Scan through each test case, building up the string to be broken in testString, 1576 // and the positions that should be boundaries in the breakPositions vector. 1577 // 1578 int spin = 0; 1579 while (tokenMatcher.find()) { 1580 if(tokenMatcher.hitEnd()) { 1581 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1582 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1583 and caused an infinite loop here on EBCDIC systems! 1584 */ 1585 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1586 // return; 1587 } 1588 if (tokenMatcher.start(1, status) >= 0) { 1589 // Scanned a divide sign, indicating a break position in the test data. 1590 if (testString.length()>0) { 1591 breakPositions.addElement(testString.length(), status); 1592 } 1593 } 1594 else if (tokenMatcher.start(2, status) >= 0) { 1595 // Scanned an 'x', meaning no break at this position in the test data 1596 // Nothing to be done here. 1597 } 1598 else if (tokenMatcher.start(3, status) >= 0) { 1599 // Scanned Hex digits. Convert them to binary, append to the character data string. 
1600 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1601 int length = hexNumber.length(); 1602 if (length<=8) { 1603 char buf[10]; 1604 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1605 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1606 if (c<=0x10ffff) { 1607 testString.append(c); 1608 } else { 1609 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1610 fileName, lineNumber); 1611 } 1612 } else { 1613 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1614 fileName, lineNumber); 1615 } 1616 } 1617 else if (tokenMatcher.start(4, status) >= 0) { 1618 // Scanned to end of a line, possibly skipping over a comment in the process. 1619 // If the line from the file contained test data, run the test now. 1620 // 1621 if (testString.length() > 0) { 1622 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data. 1623 // Rule 8 1624 // ZW SP* <break> 1625 // is not yet implemented. 1626 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber || 1627 5202 == lineNumber || 1628 5214 == lineNumber || 1629 5246 == lineNumber || 1630 5298 == lineNumber || 1631 5302 == lineNumber ))) { 1632 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1633 } 1634 } 1635 1636 // Clear out this test case. 1637 // The string and breakPositions vector will be refilled as the next 1638 // test case is parsed. 1639 testString.remove(); 1640 breakPositions.removeAllElements(); 1641 lineNumber++; 1642 } else { 1643 // Scanner catchall. Something unrecognized appeared on the line. 1644 char token[16]; 1645 UnicodeString uToken = tokenMatcher.group(0, status); 1646 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1647 token[sizeof(token)-1] = 0; 1648 errln("Syntax error in test data file \'%s\', line %d. 
Scanning \"%s\"\n", fileName, lineNumber, token); 1649 1650 // Clean up, in preparation for continuing with the next line. 1651 testString.remove(); 1652 breakPositions.removeAllElements(); 1653 lineNumber++; 1654 } 1655 TEST_ASSERT_SUCCESS(status); 1656 if (U_FAILURE(status)) { 1657 break; 1658 } 1659 } 1660 1661 delete [] testFile; 1662 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1663 } 1664 1665 //-------------------------------------------------------------------------------------------- 1666 // 1667 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1668 // test data files. Do only a simple, forward-only check - 1669 // this test is mostly to check that ICU and the Unicode 1670 // data agree with each other. 1671 // 1672 //-------------------------------------------------------------------------------------------- 1673 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1674 const UnicodeString &testString, // Text data to be broken 1675 UVector32 *breakPositions, // Positions where breaks should be found. 1676 RuleBasedBreakIterator *bi) { 1677 int32_t pos; // Break Position in the test string 1678 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
1679 int32_t expectedPos; // Expected break position (index into test string) 1680 1681 bi->setText(testString); 1682 pos = bi->first(); 1683 pos = bi->next(); 1684 1685 while (pos != BreakIterator::DONE) { 1686 if (expectedI >= breakPositions->size()) { 1687 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1688 testFileName, lineNumber, pos); 1689 break; 1690 } 1691 expectedPos = breakPositions->elementAti(expectedI); 1692 if (pos < expectedPos) { 1693 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1694 testFileName, lineNumber, pos); 1695 break; 1696 } 1697 if (pos > expectedPos) { 1698 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1699 testFileName, lineNumber, expectedPos); 1700 break; 1701 } 1702 pos = bi->next(); 1703 expectedI++; 1704 } 1705 1706 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1707 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1708 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1709 } 1710 } 1711 1712 1713 1714 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1715 //--------------------------------------------------------------------------------------- 1716 // 1717 // classs RBBIMonkeyKind 1718 // 1719 // Monkey Test for Break Iteration 1720 // Abstract interface class. Concrete derived classes independently 1721 // implement the break rules for different iterator types. 1722 // 1723 // The Monkey Test itself uses doesn't know which type of break iterator it is 1724 // testing, but works purely in terms of the interface defined here. 1725 // 1726 //--------------------------------------------------------------------------------------- 1727 class RBBIMonkeyKind { 1728 public: 1729 // Return a UVector of UnicodeSets, representing the character classes used 1730 // for this type of iterator. 
1731 virtual UVector *charClasses() = 0; 1732 1733 // Set the test text on which subsequent calls to next() will operate 1734 virtual void setText(const UnicodeString &s) = 0; 1735 1736 // Find the next break postion, starting from the prev break position, or from zero. 1737 // Return -1 after reaching end of string. 1738 virtual int32_t next(int32_t i) = 0; 1739 1740 virtual ~RBBIMonkeyKind(); 1741 UErrorCode deferredStatus; 1742 1743 1744 protected: 1745 RBBIMonkeyKind(); 1746 1747 private: 1748 }; 1749 1750 RBBIMonkeyKind::RBBIMonkeyKind() { 1751 deferredStatus = U_ZERO_ERROR; 1752 } 1753 1754 RBBIMonkeyKind::~RBBIMonkeyKind() { 1755 } 1756 1757 1758 //---------------------------------------------------------------------------------------- 1759 // 1760 // Random Numbers. Similar to standard lib rand() and srand() 1761 // Not using library to 1762 // 1. Get same results on all platforms. 1763 // 2. Get access to current seed, to more easily reproduce failures. 1764 // 1765 //--------------------------------------------------------------------------------------- 1766 static uint32_t m_seed = 1; 1767 1768 static uint32_t m_rand() 1769 { 1770 m_seed = m_seed * 1103515245 + 12345; 1771 return (uint32_t)(m_seed/65536) % 32768; 1772 } 1773 1774 1775 //------------------------------------------------------------------------------------------ 1776 // 1777 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1778 // of RBBIMonkeyKind. 
1779 // 1780 //------------------------------------------------------------------------------------------ 1781 class RBBICharMonkey: public RBBIMonkeyKind { 1782 public: 1783 RBBICharMonkey(); 1784 virtual ~RBBICharMonkey(); 1785 virtual UVector *charClasses(); 1786 virtual void setText(const UnicodeString &s); 1787 virtual int32_t next(int32_t i); 1788 private: 1789 UVector *fSets; 1790 1791 UnicodeSet *fCRLFSet; 1792 UnicodeSet *fControlSet; 1793 UnicodeSet *fExtendSet; 1794 UnicodeSet *fRegionalIndicatorSet; 1795 UnicodeSet *fPrependSet; 1796 UnicodeSet *fSpacingSet; 1797 UnicodeSet *fLSet; 1798 UnicodeSet *fVSet; 1799 UnicodeSet *fTSet; 1800 UnicodeSet *fLVSet; 1801 UnicodeSet *fLVTSet; 1802 UnicodeSet *fHangulSet; 1803 UnicodeSet *fAnySet; 1804 1805 const UnicodeString *fText; 1806 }; 1807 1808 1809 RBBICharMonkey::RBBICharMonkey() { 1810 UErrorCode status = U_ZERO_ERROR; 1811 1812 fText = NULL; 1813 1814 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 1815 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 1816 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 1817 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 1818 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 1819 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 1820 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 1821 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 1822 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 1823 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 1824 fLVTSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 1825 fHangulSet = new UnicodeSet(); 1826 fHangulSet->addAll(*fLSet); 1827 fHangulSet->addAll(*fVSet); 1828 fHangulSet->addAll(*fTSet); 1829 fHangulSet->addAll(*fLVSet); 1830 fHangulSet->addAll(*fLVTSet); 1831 fAnySet = new UnicodeSet(0, 0x10ffff); 1832 1833 fSets = new UVector(status); 1834 fSets->addElement(fCRLFSet, status); 1835 fSets->addElement(fControlSet, status); 1836 fSets->addElement(fExtendSet, status); 1837 fSets->addElement(fRegionalIndicatorSet, status); 1838 if (!fPrependSet->isEmpty()) { 1839 fSets->addElement(fPrependSet, status); 1840 } 1841 fSets->addElement(fSpacingSet, status); 1842 fSets->addElement(fHangulSet, status); 1843 fSets->addElement(fAnySet, status); 1844 if (U_FAILURE(status)) { 1845 deferredStatus = status; 1846 } 1847 } 1848 1849 1850 void RBBICharMonkey::setText(const UnicodeString &s) { 1851 fText = &s; 1852 } 1853 1854 1855 1856 int32_t RBBICharMonkey::next(int32_t prevPos) { 1857 int p0, p1, p2, p3; // Indices of the significant code points around the 1858 // break position being tested. The candidate break 1859 // location is before p2. 1860 1861 int breakPos = -1; 1862 1863 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 1864 1865 if (U_FAILURE(deferredStatus)) { 1866 return -1; 1867 } 1868 1869 // Previous break at end of string. return DONE. 1870 if (prevPos >= fText->length()) { 1871 return -1; 1872 } 1873 p0 = p1 = p2 = p3 = prevPos; 1874 c3 = fText->char32At(prevPos); 1875 c0 = c1 = c2 = 0; 1876 1877 // Loop runs once per "significant" character position in the input text. 1878 for (;;) { 1879 // Move all of the positions forward in the input string. 1880 p0 = p1; c0 = c1; 1881 p1 = p2; c1 = c2; 1882 p2 = p3; c2 = c3; 1883 1884 // Advancd p3 by one codepoint 1885 p3 = fText->moveIndex32(p3, 1); 1886 c3 = fText->char32At(p3); 1887 1888 if (p1 == p2) { 1889 // Still warming up the loop. 
(won't work with zero length strings, but we don't care) 1890 continue; 1891 } 1892 if (p2 == fText->length()) { 1893 // Reached end of string. Always a break position. 1894 break; 1895 } 1896 1897 // Rule GB3 CR x LF 1898 // No Extend or Format characters may appear between the CR and LF, 1899 // which requires the additional check for p2 immediately following p1. 1900 // 1901 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 1902 continue; 1903 } 1904 1905 // Rule (GB4). ( Control | CR | LF ) <break> 1906 if (fControlSet->contains(c1) || 1907 c1 == 0x0D || 1908 c1 == 0x0A) { 1909 break; 1910 } 1911 1912 // Rule (GB5) <break> ( Control | CR | LF ) 1913 // 1914 if (fControlSet->contains(c2) || 1915 c2 == 0x0D || 1916 c2 == 0x0A) { 1917 break; 1918 } 1919 1920 1921 // Rule (GB6) L x ( L | V | LV | LVT ) 1922 if (fLSet->contains(c1) && 1923 (fLSet->contains(c2) || 1924 fVSet->contains(c2) || 1925 fLVSet->contains(c2) || 1926 fLVTSet->contains(c2))) { 1927 continue; 1928 } 1929 1930 // Rule (GB7) ( LV | V ) x ( V | T ) 1931 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 1932 (fVSet->contains(c2) || fTSet->contains(c2))) { 1933 continue; 1934 } 1935 1936 // Rule (GB8) ( LVT | T) x T 1937 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 1938 fTSet->contains(c2)) { 1939 continue; 1940 } 1941 1942 // Rule (GB8a) Regional_Indicator x Regional_Indicator 1943 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 1944 continue; 1945 } 1946 1947 // Rule (GB9) Numeric x ALetter 1948 if (fExtendSet->contains(c2)) { 1949 continue; 1950 } 1951 1952 // Rule (GB9a) x SpacingMark 1953 if (fSpacingSet->contains(c2)) { 1954 continue; 1955 } 1956 1957 // Rule (GB9b) Prepend x 1958 if (fPrependSet->contains(c1)) { 1959 continue; 1960 } 1961 1962 // Rule (GB10) Any <break> Any 1963 break; 1964 } 1965 1966 breakPos = p2; 1967 return breakPos; 1968 } 1969 1970 1971 1972 UVector *RBBICharMonkey::charClasses() { 1973 return fSets; 1974 } 1975 1976 1977 
// Destructor.  Deletes the character-class vector and each member pointer listed
// below.  The member declarations and their allocations are outside this part of
// the file; presumably they are the UnicodeSets built by the constructor.
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fRegionalIndicatorSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
}

//------------------------------------------------------------------------------------------
//
//   class RBBIWordMonkey      Word Break specific implementation
//                             of RBBIMonkeyKind.
//
//   Holds one UnicodeSet per Word_Break character class, and a hand-written
//   implementation of the word boundary rules (see next()) that walks the text
//   supplied through setText().
//
//------------------------------------------------------------------------------------------
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    // Returns the vector of character-class sets filled in by the constructor.
    virtual  UVector *charClasses();
    // Remembers the address of the text to analyze; the string is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the index of the first boundary following position i, or -1.
    virtual int32_t   next(int32_t i);
private:
    UVector      *fSets;              // The sets handed out by charClasses().

    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fKatakanaSet;        // Used by the rules, but not added to fSets.
    UnicodeSet  *fALetterSet;         // Word_Break=ALetter minus fDictionaryCjkSet.
    // TODO(jungshik): Do we still need this change?
    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;           // Complement of the classes removed from it in the constructor.
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fDictionaryCjkSet;   // Hangul syllables, Han, Hiragana, Katakana; kept out of the test.

    // NOTE(review): fMatcher is never assigned, read or deleted by any
    //               RBBIWordMonkey member function; it looks like a leftover.
    RegexMatcher  *fMatcher;

    const UnicodeString  *fText;      // Text under test; set by setText(), not owned.
};


// Constructor.  Builds the Word_Break character-class sets and registers them in fSets.
// Any ICU error is recorded in deferredStatus (declared outside this chunk, presumably
// in RBBIMonkeyKind) instead of being reported here; next() checks it before doing any work.
RBBIWordMonkey::RBBIWordMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),      status);
    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),      status);
    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
    // Exclude Hangul syllables from ALetterSet during testing.
    // Leave CJK dictionary characters out from the monkey tests!
    // (The wider ALetter definition below is compiled out; the plain Word_Break=ALetter
    //  set that follows the #endif is the one actually used.)
#if 0
    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
                                      "[\\p{Line_Break = Complex_Context}"
                                      "-\\p{Grapheme_Cluster_Break = Extend}"
                                      "-\\p{Grapheme_Cluster_Break = Control}"
                                      "]]",
                                      status);
#endif
    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
    fALetterSet->removeAll(*fDictionaryCjkSet);
    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
    // we should figure out why
    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);

    fOtherSet        = new UnicodeSet();
    // If any set pattern failed, give up here.  fSets is then left empty.
    if(U_FAILURE(status)) {
      deferredStatus = status;
      return;
    }

    // "Other" is everything that is left after the classes below are removed.
    // NOTE(review): fMidNumLetSet is not removed, so MidNumLet characters also
    //               remain in fOtherSet - confirm whether that is intentional.
    fOtherSet->complement();
    fOtherSet->removeAll(*fCRSet);
    fOtherSet->removeAll(*fLFSet);
    fOtherSet->removeAll(*fNewlineSet);
    fOtherSet->removeAll(*fKatakanaSet);
    fOtherSet->removeAll(*fALetterSet);
    fOtherSet->removeAll(*fMidLetterSet);
    fOtherSet->removeAll(*fMidNumSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fExtendNumLetSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fExtendSet);
    fOtherSet->removeAll(*fRegionalIndicatorSet);
    // Inhibit dictionary characters from being tested at all.
    fOtherSet->removeAll(*fDictionaryCjkSet);
    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));

    // Register the classes returned by charClasses().  Katakana is deliberately
    // left out (see the TODO below).
    fSets->addElement(fCRSet,        status);
    fSets->addElement(fLFSet,        status);
    fSets->addElement(fNewlineSet,   status);
    fSets->addElement(fALetterSet,   status);
    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
    fSets->addElement(fMidLetterSet, status);
    fSets->addElement(fMidNumLetSet, status);
    fSets->addElement(fMidNumSet,    status);
    fSets->addElement(fNumericSet,   status);
    fSets->addElement(fFormatSet,    status);
    fSets->addElement(fExtendSet,    status);
    fSets->addElement(fOtherSet,     status);
    fSets->addElement(fExtendNumLetSet, status);
    fSets->addElement(fRegionalIndicatorSet, status);

    // Catches failures from the Complex_Context set and from the addElement() calls.
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}

// Stores a pointer to the caller's string.  The string must stay alive for as
// long as next() is going to be called.
void RBBIWordMonkey::setText(const UnicodeString &s) {
    fText       = &s;
}


// Reference implementation of the word boundary rules.
//
//   prevPos   index of the previous boundary in the text given to setText().
//   returns   index of the next boundary, or -1 if prevPos is at or beyond the
//             end of the text, or if construction failed (deferredStatus).
//
// The rules are tried in order for each candidate position; "continue" means
// "no break here, look at the next position", "break" means "boundary found".
// The order of the tests therefore matters.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        //    When c2 is CR, LF or Newline the loop stops after a single step, so p3
        //    ends up on the code point immediately following p2.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
               break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     No explicit adjacency test is needed here: the Rule 4 loop above never
        //     skips anything after a CR, so when c1 is CR, p2 is the very next position.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (5).   ALetter x ALetter
        if (fALetterSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
        //
        if ( fALetterSet->contains(c1)   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
             fALetterSet->contains(c3)) {
            continue;
        }


        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
        if (fALetterSet->contains(c0) &&
            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
            fALetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    ALetter x Numeric
        if (fALetterSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x ALetter
        if (fNumericSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a   (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
        }

        // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
                fKatakanaSet->contains(c2)))  {
                continue;
        }

        // Rule 13c   Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule 14.  Break found here.
        break;
    }

    breakPos = p2;
    return breakPos;
}


// Returns the vector of character classes built by the constructor.
UVector  *RBBIWordMonkey::charClasses() {
    return fSets;
}


// Destructor.  Deletes the vector and every set allocated by the constructor,
// including fKatakanaSet and fDictionaryCjkSet, which were never added to fSets.
RBBIWordMonkey::~RBBIWordMonkey() {
    delete fSets;
    delete fCRSet;
    delete fLFSet;
    delete fNewlineSet;
    delete fKatakanaSet;
    delete fALetterSet;
    delete fMidNumLetSet;
    delete fMidLetterSet;
    delete fMidNumSet;
    delete fNumericSet;
    delete fFormatSet;
    delete fExtendSet;
    delete fExtendNumLetSet;
    delete fRegionalIndicatorSet;
    delete fDictionaryCjkSet;
    delete fOtherSet;
}




//------------------------------------------------------------------------------------------
//
//   class RBBISentMonkey      Sentence Break specific implementation
//                             of RBBIMonkeyKind.
//
//   Holds one UnicodeSet per Sentence_Break character class, plus small helpers
//   for stepping over Format and Extend characters in the text under test.
//
//------------------------------------------------------------------------------------------
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // Returns the vector of character-class sets filled in by the constructor.
    virtual  UVector *charClasses();
    // Remembers the address of the text to analyze; the string is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the index of the first boundary following position i, or -1.
    virtual int32_t   next(int32_t i);
private:
    int               moveBack(int posFrom);     // Step backwards, skipping Format / Extend.
    int               moveForward(int posFrom);  // Step forwards, skipping Format / Extend.
    UChar32           cAt(int pos);              // Code point at pos, or -1 if pos is out of range.

    UVector      *fSets;              // The sets handed out by charClasses().

    UnicodeSet  *fSepSet;             // Sentence_Break=Sep plus CR and LF (see constructor).
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;           // Complement of all of the other sets.
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;      // Text under test; set by setText(), not owned.

};

// Constructor.  Builds the Sentence_Break character-class sets and registers them in
// fSets.  Any ICU error is recorded in deferredStatus (declared outside this chunk)
// instead of being reported here.
RBBISentMonkey::RBBISentMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
    //                       set and made into character classes of their own.  For the monkey impl,
    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
    fOtherSet        = new UnicodeSet();

    // If any set pattern failed, give up here.  fSets is then left empty.
    if(U_FAILURE(status)) {
      deferredStatus = status;
      return;
    }

    // "Other" is everything that is not in one of the sets above.
    fOtherSet->complement();
    fOtherSet->removeAll(*fSepSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fSpSet);
    fOtherSet->removeAll(*fLowerSet);
    fOtherSet->removeAll(*fUpperSet);
    fOtherSet->removeAll(*fOLetterSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fATermSet);
    fOtherSet->removeAll(*fSContinueSet);
    fOtherSet->removeAll(*fSTermSet);
    fOtherSet->removeAll(*fCloseSet);
    fOtherSet->removeAll(*fExtendSet);

    // Register the classes returned by charClasses().
    fSets->addElement(fSepSet,       status);
    fSets->addElement(fFormatSet,    status);
    fSets->addElement(fSpSet,        status);
    fSets->addElement(fLowerSet,     status);
    fSets->addElement(fUpperSet,     status);
    fSets->addElement(fOLetterSet,   status);
    fSets->addElement(fNumericSet,   status);
    fSets->addElement(fATermSet,     status);
    fSets->addElement(fSContinueSet, status);
    fSets->addElement(fSTermSet,     status);
    fSets->addElement(fCloseSet,     status);
    fSets->addElement(fOtherSet,     status);
    fSets->addElement(fExtendSet,    status);

    // Catches failures from the addElement() calls.
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}



// Stores a pointer to the caller's string.  The string must stay alive for as
// long as next() is going to be called.
void RBBISentMonkey::setText(const UnicodeString &s) {
    fText       = &s;
}

// Returns the vector of character classes built by the constructor.
UVector  *RBBISentMonkey::charClasses() {
    return fSets;
}


//  moveBack()   Find the "significant" code point preceding the index i.
//               Skips over ($Extend | $Format)* .
2409 // 2410 int RBBISentMonkey::moveBack(int i) { 2411 if (i <= 0) { 2412 return -1; 2413 } 2414 UChar32 c; 2415 int32_t j = i; 2416 do { 2417 j = fText->moveIndex32(j, -1); 2418 c = fText->char32At(j); 2419 } 2420 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2421 return j; 2422 2423 } 2424 2425 2426 int RBBISentMonkey::moveForward(int i) { 2427 if (i>=fText->length()) { 2428 return fText->length(); 2429 } 2430 UChar32 c; 2431 int32_t j = i; 2432 do { 2433 j = fText->moveIndex32(j, 1); 2434 c = cAt(j); 2435 } 2436 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2437 return j; 2438 } 2439 2440 UChar32 RBBISentMonkey::cAt(int pos) { 2441 if (pos<0 || pos>=fText->length()) { 2442 return -1; 2443 } else { 2444 return fText->char32At(pos); 2445 } 2446 } 2447 2448 int32_t RBBISentMonkey::next(int32_t prevPos) { 2449 int p0, p1, p2, p3; // Indices of the significant code points around the 2450 // break position being tested. The candidate break 2451 // location is before p2. 2452 2453 int breakPos = -1; 2454 2455 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2456 UChar32 c; 2457 2458 if (U_FAILURE(deferredStatus)) { 2459 return -1; 2460 } 2461 2462 // Prev break at end of string. return DONE. 2463 if (prevPos >= fText->length()) { 2464 return -1; 2465 } 2466 p0 = p1 = p2 = p3 = prevPos; 2467 c3 = fText->char32At(prevPos); 2468 c0 = c1 = c2 = 0; 2469 2470 // Loop runs once per "significant" character position in the input text. 2471 for (;;) { 2472 // Move all of the positions forward in the input string. 2473 p0 = p1; c0 = c1; 2474 p1 = p2; c1 = c2; 2475 p2 = p3; c2 = c3; 2476 2477 // Advancd p3 by X(Extend | Format)* Rule 4 2478 p3 = moveForward(p3); 2479 c3 = cAt(p3); 2480 2481 // Rule (3) CR x LF 2482 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2483 continue; 2484 } 2485 2486 // Rule (4). Sep <break> 2487 if (fSepSet->contains(c1)) { 2488 p2 = p1+1; // Separators don't combine with Extend or Format. 
2489 break; 2490 } 2491 2492 if (p2 >= fText->length()) { 2493 // Reached end of string. Always a break position. 2494 break; 2495 } 2496 2497 if (p2 == prevPos) { 2498 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2499 continue; 2500 } 2501 2502 // Rule (6). ATerm x Numeric 2503 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2504 continue; 2505 } 2506 2507 // Rule (7). Upper ATerm x Uppper 2508 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2509 continue; 2510 } 2511 2512 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2513 // Note: STerm | ATerm are added to the negated part of the expression by a 2514 // note to the Unicode 5.0 documents. 2515 int p8 = p1; 2516 while (fSpSet->contains(cAt(p8))) { 2517 p8 = moveBack(p8); 2518 } 2519 while (fCloseSet->contains(cAt(p8))) { 2520 p8 = moveBack(p8); 2521 } 2522 if (fATermSet->contains(cAt(p8))) { 2523 p8=p2; 2524 for (;;) { 2525 c = cAt(p8); 2526 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2527 fLowerSet->contains(c) || fSepSet->contains(c) || 2528 fATermSet->contains(c) || fSTermSet->contains(c)) { 2529 break; 2530 } 2531 p8 = moveForward(p8); 2532 } 2533 if (fLowerSet->contains(cAt(p8))) { 2534 continue; 2535 } 2536 } 2537 2538 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2539 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2540 p8 = p1; 2541 while (fSpSet->contains(cAt(p8))) { 2542 p8 = moveBack(p8); 2543 } 2544 while (fCloseSet->contains(cAt(p8))) { 2545 p8 = moveBack(p8); 2546 } 2547 c = cAt(p8); 2548 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2549 continue; 2550 } 2551 } 2552 2553 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2554 int p9 = p1; 2555 while (fCloseSet->contains(cAt(p9))) { 2556 p9 = moveBack(p9); 2557 } 2558 c = cAt(p9); 2559 if 
((fSTermSet->contains(c) || fATermSet->contains(c))) { 2560 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2561 continue; 2562 } 2563 } 2564 2565 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2566 int p10 = p1; 2567 while (fSpSet->contains(cAt(p10))) { 2568 p10 = moveBack(p10); 2569 } 2570 while (fCloseSet->contains(cAt(p10))) { 2571 p10 = moveBack(p10); 2572 } 2573 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2574 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2575 continue; 2576 } 2577 } 2578 2579 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break> 2580 int p11 = p1; 2581 if (fSepSet->contains(cAt(p11))) { 2582 p11 = moveBack(p11); 2583 } 2584 while (fSpSet->contains(cAt(p11))) { 2585 p11 = moveBack(p11); 2586 } 2587 while (fCloseSet->contains(cAt(p11))) { 2588 p11 = moveBack(p11); 2589 } 2590 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2591 break; 2592 } 2593 2594 // Rule (12) Any x Any 2595 continue; 2596 } 2597 breakPos = p2; 2598 return breakPos; 2599 } 2600 2601 RBBISentMonkey::~RBBISentMonkey() { 2602 delete fSets; 2603 delete fSepSet; 2604 delete fFormatSet; 2605 delete fSpSet; 2606 delete fLowerSet; 2607 delete fUpperSet; 2608 delete fOLetterSet; 2609 delete fNumericSet; 2610 delete fATermSet; 2611 delete fSContinueSet; 2612 delete fSTermSet; 2613 delete fCloseSet; 2614 delete fOtherSet; 2615 delete fExtendSet; 2616 } 2617 2618 2619 2620 //------------------------------------------------------------------------------------------- 2621 // 2622 // RBBILineMonkey 2623 // 2624 //------------------------------------------------------------------------------------------- 2625 2626 class RBBILineMonkey: public RBBIMonkeyKind { 2627 public: 2628 RBBILineMonkey(); 2629 virtual ~RBBILineMonkey(); 2630 virtual UVector *charClasses(); 2631 virtual void setText(const UnicodeString &s); 2632 virtual int32_t next(int32_t i); 2633 virtual void 
rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2634 private: 2635 UVector *fSets; 2636 2637 UnicodeSet *fBK; 2638 UnicodeSet *fCR; 2639 UnicodeSet *fLF; 2640 UnicodeSet *fCM; 2641 UnicodeSet *fNL; 2642 UnicodeSet *fSG; 2643 UnicodeSet *fWJ; 2644 UnicodeSet *fZW; 2645 UnicodeSet *fGL; 2646 UnicodeSet *fCB; 2647 UnicodeSet *fSP; 2648 UnicodeSet *fB2; 2649 UnicodeSet *fBA; 2650 UnicodeSet *fBB; 2651 UnicodeSet *fHY; 2652 UnicodeSet *fH2; 2653 UnicodeSet *fH3; 2654 UnicodeSet *fCL; 2655 UnicodeSet *fCP; 2656 UnicodeSet *fEX; 2657 UnicodeSet *fIN; 2658 UnicodeSet *fJL; 2659 UnicodeSet *fJV; 2660 UnicodeSet *fJT; 2661 UnicodeSet *fNS; 2662 UnicodeSet *fOP; 2663 UnicodeSet *fQU; 2664 UnicodeSet *fIS; 2665 UnicodeSet *fNU; 2666 UnicodeSet *fPO; 2667 UnicodeSet *fPR; 2668 UnicodeSet *fSY; 2669 UnicodeSet *fAI; 2670 UnicodeSet *fAL; 2671 UnicodeSet *fCJ; 2672 UnicodeSet *fHL; 2673 UnicodeSet *fID; 2674 UnicodeSet *fRI; 2675 UnicodeSet *fSA; 2676 UnicodeSet *fXX; 2677 2678 BreakIterator *fCharBI; 2679 2680 const UnicodeString *fText; 2681 int32_t *fOrigPositions; 2682 2683 RegexMatcher *fNumberMatcher; 2684 RegexMatcher *fLB11Matcher; 2685 }; 2686 2687 2688 RBBILineMonkey::RBBILineMonkey() 2689 { 2690 UErrorCode status = U_ZERO_ERROR; 2691 2692 fSets = new UVector(status); 2693 2694 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2695 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2696 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2697 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2698 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2699 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2700 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2701 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2702 fCB = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2703 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2704 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2705 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2706 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2707 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2708 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2709 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2710 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2711 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2712 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2713 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2714 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2715 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2716 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2717 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2718 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2719 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2720 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2721 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2722 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2723 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2724 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2725 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2726 fAL = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2727 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2728 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2729 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2730 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2731 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2732 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2733 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2734 2735 if (U_FAILURE(status)) { 2736 deferredStatus = status; 2737 fCharBI = NULL; 2738 fNumberMatcher = NULL; 2739 return; 2740 } 2741 2742 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2743 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2744 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2745 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2746 2747 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 
2748 2749 fSets->addElement(fBK, status); 2750 fSets->addElement(fCR, status); 2751 fSets->addElement(fLF, status); 2752 fSets->addElement(fCM, status); 2753 fSets->addElement(fNL, status); 2754 fSets->addElement(fWJ, status); 2755 fSets->addElement(fZW, status); 2756 fSets->addElement(fGL, status); 2757 fSets->addElement(fCB, status); 2758 fSets->addElement(fSP, status); 2759 fSets->addElement(fB2, status); 2760 fSets->addElement(fBA, status); 2761 fSets->addElement(fBB, status); 2762 fSets->addElement(fHY, status); 2763 fSets->addElement(fH2, status); 2764 fSets->addElement(fH3, status); 2765 fSets->addElement(fCL, status); 2766 fSets->addElement(fCP, status); 2767 fSets->addElement(fEX, status); 2768 fSets->addElement(fIN, status); 2769 fSets->addElement(fJL, status); 2770 fSets->addElement(fJT, status); 2771 fSets->addElement(fJV, status); 2772 fSets->addElement(fNS, status); 2773 fSets->addElement(fOP, status); 2774 fSets->addElement(fQU, status); 2775 fSets->addElement(fIS, status); 2776 fSets->addElement(fNU, status); 2777 fSets->addElement(fPO, status); 2778 fSets->addElement(fPR, status); 2779 fSets->addElement(fSY, status); 2780 fSets->addElement(fAI, status); 2781 fSets->addElement(fAL, status); 2782 fSets->addElement(fHL, status); 2783 fSets->addElement(fID, status); 2784 fSets->addElement(fWJ, status); 2785 fSets->addElement(fRI, status); 2786 fSets->addElement(fSA, status); 2787 fSets->addElement(fSG, status); 2788 2789 const char *rules = 2790 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 2791 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 2792 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 2793 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 2794 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
2795 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 2796 2797 fNumberMatcher = new RegexMatcher( 2798 UnicodeString(rules, -1, US_INV), 0, status); 2799 2800 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2801 2802 if (U_FAILURE(status)) { 2803 deferredStatus = status; 2804 } 2805 } 2806 2807 2808 void RBBILineMonkey::setText(const UnicodeString &s) { 2809 fText = &s; 2810 fCharBI->setText(s); 2811 fNumberMatcher->reset(s); 2812 } 2813 2814 // 2815 // rule9Adjust 2816 // Line Break TR rules 9 and 10 implementation. 2817 // This deals with combining marks and other sequences that 2818 // that must be treated as if they were something other than what they actually are. 2819 // 2820 // This is factored out into a separate function because it must be applied twice for 2821 // each potential break, once to the chars before the position being checked, then 2822 // again to the text following the possible break. 2823 // 2824 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 2825 if (pos == -1) { 2826 // Invalid initial position. Happens during the warmup iteration of the 2827 // main loop in next(). 2828 return; 2829 } 2830 2831 int32_t nPos = *nextPos; 2832 2833 // LB 9 Keep combining sequences together. 2834 // advance over any CM class chars. Note that Line Break CM is different 2835 // from the normal Grapheme Extend property. 2836 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 2837 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 2838 for (;;) { 2839 *nextChar = fText->char32At(nPos); 2840 if (!fCM->contains(*nextChar)) { 2841 break; 2842 } 2843 nPos = fText->moveIndex32(nPos, 1); 2844 } 2845 } 2846 2847 2848 // LB 9 Treat X CM* as if it were x. 2849 // No explicit action required. 
2850 2851 // LB 10 Treat any remaining combining mark as AL 2852 if (fCM->contains(*posChar)) { 2853 *posChar = 0x41; // thisChar = 'A'; 2854 } 2855 2856 // Push the updated nextPos and nextChar back to our caller. 2857 // This only makes a difference if posChar got bigger by consuming a 2858 // combining sequence. 2859 *nextPos = nPos; 2860 *nextChar = fText->char32At(nPos); 2861 } 2862 2863 2864 2865 int32_t RBBILineMonkey::next(int32_t startPos) { 2866 UErrorCode status = U_ZERO_ERROR; 2867 int32_t pos; // Index of the char following a potential break position 2868 UChar32 thisChar; // Character at above position "pos" 2869 2870 int32_t prevPos; // Index of the char preceding a potential break position 2871 UChar32 prevChar; // Character at above position. Note that prevChar 2872 // and thisChar may not be adjacent because combining 2873 // characters between them will be ignored. 2874 2875 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 2876 UChar32 prevCharX2; 2877 2878 int32_t nextPos; // Index of the next character following pos. 2879 // Usually skips over combining marks. 2880 int32_t nextCPPos; // Index of the code point following "pos." 2881 // May point to a combining mark. 2882 int32_t tPos; // temp value. 2883 UChar32 c; 2884 2885 if (U_FAILURE(deferredStatus)) { 2886 return -1; 2887 } 2888 2889 if (startPos >= fText->length()) { 2890 return -1; 2891 } 2892 2893 2894 // Initial values for loop. Loop will run the first time without finding breaks, 2895 // while the invalid values shift out and the "this" and 2896 // "prev" positions are filled in with good values. 2897 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 2898 thisChar = prevChar = prevCharX2 = 0; 2899 nextPos = nextCPPos = startPos; 2900 2901 2902 // Loop runs once per position in the test text, until a break position 2903 // is found. 
2904 for (;;) { 2905 prevPosX2 = prevPos; 2906 prevCharX2 = prevChar; 2907 2908 prevPos = pos; 2909 prevChar = thisChar; 2910 2911 pos = nextPos; 2912 thisChar = fText->char32At(pos); 2913 2914 nextCPPos = fText->moveIndex32(pos, 1); 2915 nextPos = nextCPPos; 2916 2917 // Rule LB2 - Break at end of text. 2918 if (pos >= fText->length()) { 2919 break; 2920 } 2921 2922 // Rule LB 9 - adjust for combining sequences. 2923 // We do this one out-of-order because the adjustment does not change anything 2924 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 2925 // be applied. 2926 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 2927 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 2928 c = fText->char32At(nextPos); 2929 rule9Adjust(pos, &thisChar, &nextPos, &c); 2930 2931 // If the loop is still warming up - if we haven't shifted the initial 2932 // -1 positions out of prevPos yet - loop back to advance the 2933 // position in the input without any further looking for breaks. 2934 if (prevPos == -1) { 2935 continue; 2936 } 2937 2938 // LB 4 Always break after hard line breaks, 2939 if (fBK->contains(prevChar)) { 2940 break; 2941 } 2942 2943 // LB 5 Break after CR, LF, NL, but not inside CR LF 2944 if (prevChar == 0x0d && thisChar == 0x0a) { 2945 continue; 2946 } 2947 if (prevChar == 0x0d || 2948 prevChar == 0x0a || 2949 prevChar == 0x85) { 2950 break; 2951 } 2952 2953 // LB 6 Don't break before hard line breaks 2954 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 2955 fBK->contains(thisChar)) { 2956 continue; 2957 } 2958 2959 2960 // LB 7 Don't break before spaces or zero-width space. 2961 if (fSP->contains(thisChar)) { 2962 continue; 2963 } 2964 2965 if (fZW->contains(thisChar)) { 2966 continue; 2967 } 2968 2969 // LB 8 Break after zero width space 2970 if (fZW->contains(prevChar)) { 2971 break; 2972 } 2973 2974 // LB 9, 10 Already done, at top of loop. 
2975 // 2976 2977 2978 // LB 11 Do not break before or after WORD JOINER and related characters. 2979 // x WJ 2980 // WJ x 2981 // 2982 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 2983 continue; 2984 } 2985 2986 // LB 12 2987 // GL x 2988 if (fGL->contains(prevChar)) { 2989 continue; 2990 } 2991 2992 // LB 12a 2993 // [^SP BA HY] x GL 2994 if (!(fSP->contains(prevChar) || 2995 fBA->contains(prevChar) || 2996 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 2997 continue; 2998 } 2999 3000 3001 3002 // LB 13 Don't break before closings. 3003 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3004 // fall into LB 17 and the more general number regular expression. 3005 // 3006 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3007 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3008 fEX->contains(thisChar) || 3009 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3010 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3011 continue; 3012 } 3013 3014 // LB 14 Don't break after OP SP* 3015 // Scan backwards, checking for this sequence. 3016 // The OP char could include combining marks, so we actually check for 3017 // OP CM* SP* 3018 // Another Twist: The Rule 67 fixes may have changed a SP CM 3019 // sequence into a ID char, so before scanning back through spaces, 3020 // verify that prevChar is indeed a space. 
The prevChar variable 3021 // may differ from fText[prevPos] 3022 tPos = prevPos; 3023 if (fSP->contains(prevChar)) { 3024 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3025 tPos=fText->moveIndex32(tPos, -1); 3026 } 3027 } 3028 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3029 tPos=fText->moveIndex32(tPos, -1); 3030 } 3031 if (fOP->contains(fText->char32At(tPos))) { 3032 continue; 3033 } 3034 3035 3036 // LB 15 QU SP* x OP 3037 if (fOP->contains(thisChar)) { 3038 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3039 int tPos = prevPos; 3040 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3041 tPos = fText->moveIndex32(tPos, -1); 3042 } 3043 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3044 tPos = fText->moveIndex32(tPos, -1); 3045 } 3046 if (fQU->contains(fText->char32At(tPos))) { 3047 continue; 3048 } 3049 } 3050 3051 3052 3053 // LB 16 (CL | CP) SP* x NS 3054 // Scan backwards for SP* CM* (CL | CP) 3055 if (fNS->contains(thisChar)) { 3056 int tPos = prevPos; 3057 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3058 tPos = fText->moveIndex32(tPos, -1); 3059 } 3060 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3061 tPos = fText->moveIndex32(tPos, -1); 3062 } 3063 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3064 continue; 3065 } 3066 } 3067 3068 3069 // LB 17 B2 SP* x B2 3070 if (fB2->contains(thisChar)) { 3071 // Scan backwards, checking for the B2 CM* SP* sequence. 
3072 tPos = prevPos; 3073 if (fSP->contains(prevChar)) { 3074 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3075 tPos=fText->moveIndex32(tPos, -1); 3076 } 3077 } 3078 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3079 tPos=fText->moveIndex32(tPos, -1); 3080 } 3081 if (fB2->contains(fText->char32At(tPos))) { 3082 continue; 3083 } 3084 } 3085 3086 3087 // LB 18 break after space 3088 if (fSP->contains(prevChar)) { 3089 break; 3090 } 3091 3092 // LB 19 3093 // x QU 3094 // QU x 3095 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3096 continue; 3097 } 3098 3099 // LB 20 Break around a CB 3100 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3101 break; 3102 } 3103 3104 // LB 21 3105 if (fBA->contains(thisChar) || 3106 fHY->contains(thisChar) || 3107 fNS->contains(thisChar) || 3108 fBB->contains(prevChar) ) { 3109 continue; 3110 } 3111 3112 // LB 21a 3113 // HL (HY | BA) x 3114 if (fHL->contains(prevCharX2) && 3115 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3116 continue; 3117 } 3118 3119 // LB 22 3120 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3121 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3122 (fID->contains(prevChar) && fIN->contains(thisChar)) || 3123 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3124 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3125 continue; 3126 } 3127 3128 3129 // LB 23 ID x PO 3130 // AL x NU 3131 // HL x NU 3132 // NU x AL 3133 if ((fID->contains(prevChar) && fPO->contains(thisChar)) || 3134 (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3135 (fHL->contains(prevChar) && fNU->contains(thisChar)) || 3136 (fNU->contains(prevChar) && fAL->contains(thisChar)) || 3137 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) { 3138 continue; 3139 } 3140 3141 // LB 24 Do not break between prefix and letters or ideographs. 
3142 // PR x ID 3143 // PR x (AL | HL) 3144 // PO x (AL | HL) 3145 if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3146 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) || 3147 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) { 3148 continue; 3149 } 3150 3151 3152 3153 // LB 25 Numbers 3154 if (fNumberMatcher->lookingAt(prevPos, status)) { 3155 if (U_FAILURE(status)) { 3156 break; 3157 } 3158 // Matched a number. But could have been just a single digit, which would 3159 // not represent a "no break here" between prevChar and thisChar 3160 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3161 if (numEndIdx > pos) { 3162 // Number match includes at least our two chars being checked 3163 if (numEndIdx > nextPos) { 3164 // Number match includes additional chars. Update pos and nextPos 3165 // so that next loop iteration will continue at the end of the number, 3166 // checking for breaks between last char in number & whatever follows. 3167 pos = nextPos = numEndIdx; 3168 do { 3169 pos = fText->moveIndex32(pos, -1); 3170 thisChar = fText->char32At(pos); 3171 } while (fCM->contains(thisChar)); 3172 } 3173 continue; 3174 } 3175 } 3176 3177 3178 // LB 26 Do not break a Korean syllable. 3179 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3180 fJV->contains(thisChar) || 3181 fH2->contains(thisChar) || 3182 fH3->contains(thisChar))) { 3183 continue; 3184 } 3185 3186 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3187 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3188 continue; 3189 } 3190 3191 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3192 fJT->contains(thisChar)) { 3193 continue; 3194 } 3195 3196 // LB 27 Treat a Korean Syllable Block the same as ID. 
3197 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3198 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3199 fIN->contains(thisChar)) { 3200 continue; 3201 } 3202 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3203 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3204 fPO->contains(thisChar)) { 3205 continue; 3206 } 3207 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3208 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3209 continue; 3210 } 3211 3212 3213 3214 // LB 28 Do not break between alphabetics ("at"). 3215 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3216 continue; 3217 } 3218 3219 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3220 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3221 continue; 3222 } 3223 3224 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 3225 // (AL | NU) x OP 3226 // CP x (AL | NU) 3227 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3228 continue; 3229 } 3230 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3231 continue; 3232 } 3233 3234 // LB30a Do not break between regional indicators. 
3235 // RI x RI 3236 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3237 continue; 3238 } 3239 3240 // LB 31 Break everywhere else 3241 break; 3242 3243 } 3244 3245 return pos; 3246 } 3247 3248 3249 UVector *RBBILineMonkey::charClasses() { 3250 return fSets; 3251 } 3252 3253 3254 RBBILineMonkey::~RBBILineMonkey() { 3255 delete fSets; 3256 3257 delete fBK; 3258 delete fCR; 3259 delete fLF; 3260 delete fCM; 3261 delete fNL; 3262 delete fWJ; 3263 delete fZW; 3264 delete fGL; 3265 delete fCB; 3266 delete fSP; 3267 delete fB2; 3268 delete fBA; 3269 delete fBB; 3270 delete fHY; 3271 delete fH2; 3272 delete fH3; 3273 delete fCL; 3274 delete fCP; 3275 delete fEX; 3276 delete fIN; 3277 delete fJL; 3278 delete fJV; 3279 delete fJT; 3280 delete fNS; 3281 delete fOP; 3282 delete fQU; 3283 delete fIS; 3284 delete fNU; 3285 delete fPO; 3286 delete fPR; 3287 delete fSY; 3288 delete fAI; 3289 delete fAL; 3290 delete fCJ; 3291 delete fHL; 3292 delete fID; 3293 delete fRI; 3294 delete fSA; 3295 delete fSG; 3296 delete fXX; 3297 3298 delete fCharBI; 3299 delete fNumberMatcher; 3300 } 3301 3302 3303 //------------------------------------------------------------------------------------------- 3304 // 3305 // TestMonkey 3306 // 3307 // params 3308 // seed=nnnnn Random number starting seed. 3309 // Setting the seed allows errors to be reproduced. 3310 // loop=nnn Looping count. Controls running time. 3311 // -1: run forever. 3312 // 0 or greater: run length. 3313 // 3314 // type = char | word | line | sent | title 3315 // 3316 //------------------------------------------------------------------------------------------- 3317 3318 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3319 int32_t val = defaultVal; 3320 name.append(" *= *(-?\\d+)"); 3321 UErrorCode status = U_ZERO_ERROR; 3322 RegexMatcher m(name, params, 0, status); 3323 if (m.find()) { 3324 // The param exists. Convert the string to an int. 
3325 char valString[100]; 3326 int32_t paramLength = m.end(1, status) - m.start(1, status); 3327 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3328 paramLength = (int32_t)(sizeof(valString)-2); 3329 } 3330 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3331 val = strtol(valString, NULL, 10); 3332 3333 // Delete this parameter from the params string. 3334 m.reset(); 3335 params = m.replaceFirst("", status); 3336 } 3337 U_ASSERT(U_SUCCESS(status)); 3338 return val; 3339 } 3340 #endif 3341 3342 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3343 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3344 BreakIterator *bi, 3345 int expected[], 3346 int expectedcount) 3347 { 3348 int count = 0; 3349 int i = 0; 3350 int forward[50]; 3351 bi->setText(ustr); 3352 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3353 forward[count] = i; 3354 if (count < expectedcount && expected[count] != i) { 3355 test->errln("break forward test failed: expected %d but got %d", 3356 expected[count], i); 3357 break; 3358 } 3359 count ++; 3360 } 3361 if (count != expectedcount) { 3362 printStringBreaks(ustr, expected, expectedcount); 3363 test->errln("break forward test failed: missed %d match", 3364 expectedcount - count); 3365 return; 3366 } 3367 // testing boundaries 3368 for (i = 1; i < expectedcount; i ++) { 3369 int j = expected[i - 1]; 3370 if (!bi->isBoundary(j)) { 3371 printStringBreaks(ustr, expected, expectedcount); 3372 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3373 return; 3374 } 3375 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3376 if (bi->isBoundary(j)) { 3377 printStringBreaks(ustr, expected, expectedcount); 3378 test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); 3379 return; 3380 } 3381 } 3382 } 3383 3384 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3385 count --; 3386 if (forward[count] != i) { 3387 printStringBreaks(ustr, expected, expectedcount); 3388 test->errln("happy break test previous() failed: expected %d but got %d", 3389 forward[count], i); 3390 break; 3391 } 3392 } 3393 if (count != 0) { 3394 printStringBreaks(ustr, expected, expectedcount); 3395 test->errln("break test previous() failed: missed a match"); 3396 return; 3397 } 3398 3399 // testing preceding 3400 for (i = 0; i < expectedcount - 1; i ++) { 3401 // int j = expected[i] + 1; 3402 int j = ustr.moveIndex32(expected[i], 1); 3403 for (; j <= expected[i + 1]; j ++) { 3404 if (bi->preceding(j) != expected[i]) { 3405 printStringBreaks(ustr, expected, expectedcount); 3406 test->errln("preceding(): Not expecting boundary at position %d", j); 3407 return; 3408 } 3409 } 3410 } 3411 } 3412 #endif 3413 3414 void RBBITest::TestWordBreaks(void) 3415 { 3416 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3417 3418 Locale locale("en"); 3419 UErrorCode status = U_ZERO_ERROR; 3420 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3421 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3422 // Replaced any C+J characters in a row with a random sequence of characters 3423 // of the same length to make our C+J segmentation not get in the way. 
3424 static const char *strlist[] = 3425 { 3426 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3427 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3428 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3429 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3430 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3431 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3432 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3433 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3434 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3435 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3436 "\\u2027\\U000e0067\\u0a47\\u00b7", 3437 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3438 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3439 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3440 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3441 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3442 "\\u0027\\u11af\\U000e0057\\u0602", 3443 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3444 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3445 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3446 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3447 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3448 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3449 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3450 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3451 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3452 "\\u18f4\\U000e0049\\u20e7\\u2027", 3453 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3454 "\\ua183\\u102d\\u0bec\\u003a", 3455 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3456 "\\u003a\\u0e57\\u0fad\\u002e", 3457 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3458 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3459 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3460 
"\\u003a\\u0664\\u00b7\\u1fba", 3461 "\\u003b\\u0027\\u00b7\\u47a3", 3462 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3463 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3464 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3465 }; 3466 int loop; 3467 if (U_FAILURE(status)) { 3468 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3469 return; 3470 } 3471 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3472 // printf("looping %d\n", loop); 3473 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3474 // RBBICharMonkey monkey; 3475 RBBIWordMonkey monkey; 3476 3477 int expected[50]; 3478 int expectedcount = 0; 3479 3480 monkey.setText(ustr); 3481 int i; 3482 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3483 expected[expectedcount ++] = i; 3484 } 3485 3486 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3487 } 3488 delete bi; 3489 #endif 3490 } 3491 3492 void RBBITest::TestWordBoundary(void) 3493 { 3494 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3495 Locale locale("en"); 3496 UErrorCode status = U_ZERO_ERROR; 3497 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3498 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3499 UChar str[50]; 3500 static const char *strlist[] = 3501 { 3502 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3503 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3504 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3505 "\\u2027\\U000e0067\\u0a47\\u00b7", 3506 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3507 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3508 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3509 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3510 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3511 "\\u0027\\u11af\\U000e0057\\u0602", 3512 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 3513 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3514 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3515 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3516 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3517 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3518 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3519 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3520 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3521 "\\u58f4\\U000e0049\\u20e7\\u2027", 3522 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3523 "\\ua183\\u102d\\u0bec\\u003a", 3524 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3525 "\\u003a\\u0e57\\u0fad\\u002e", 3526 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3527 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3528 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3529 "\\u003a\\u0664\\u00b7\\u1fba", 3530 "\\u003b\\u0027\\u00b7\\u47a3", 3531 }; 3532 int loop; 3533 if (U_FAILURE(status)) { 3534 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3535 return; 3536 } 3537 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3538 // printf("looping %d\n", loop); 3539 u_unescape(strlist[loop], str, 20); 3540 UnicodeString ustr(str); 3541 int forward[50]; 3542 int count = 0; 3543 3544 bi->setText(ustr); 3545 int prev = 0; 3546 int i; 3547 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3548 forward[count ++] = i; 3549 if (i > prev) { 3550 int j; 3551 for (j = prev + 1; j < i; j ++) { 3552 if (bi->isBoundary(j)) { 3553 printStringBreaks(ustr, forward, count); 3554 errln("happy boundary test failed: expected %d not a boundary", 3555 j); 3556 return; 3557 } 3558 } 3559 } 3560 if (!bi->isBoundary(i)) { 3561 printStringBreaks(ustr, forward, count); 3562 errln("happy boundary test failed: expected %d a boundary", 3563 i); 3564 return; 3565 } 3566 prev = i; 3567 } 3568 } 3569 delete 
bi; 3570 } 3571 3572 void RBBITest::TestLineBreaks(void) 3573 { 3574 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3575 Locale locale("en"); 3576 UErrorCode status = U_ZERO_ERROR; 3577 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3578 const int32_t STRSIZE = 50; 3579 UChar str[STRSIZE]; 3580 static const char *strlist[] = 3581 { 3582 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3583 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3584 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3585 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3586 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3587 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3588 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3589 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3590 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3591 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3592 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3593 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3594 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3595 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3596 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3597 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3598 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3599 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3600 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3601 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3602 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3603 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3604 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3605 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3606 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3607 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3608 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3609 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3610 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3611 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3612 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3613 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3614 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3615 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3616 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3617 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3618 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3619 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3620 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3621 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3622 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3623 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3624 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3625 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3626 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3627 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3628 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3629 }; 3630 int loop; 3631 TEST_ASSERT_SUCCESS(status); 3632 if (U_FAILURE(status)) { 3633 return; 3634 } 3635 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3636 // printf("looping %d\n", loop); 3637 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3638 if (t >= STRSIZE) { 3639 TEST_ASSERT(FALSE); 3640 continue; 3641 } 3642 3643 3644 UnicodeString ustr(str); 3645 RBBILineMonkey monkey; 3646 if (U_FAILURE(monkey.deferredStatus)) { 3647 continue; 3648 } 3649 3650 const int EXPECTEDSIZE = 50; 3651 int expected[EXPECTEDSIZE]; 3652 int expectedcount = 0; 3653 3654 monkey.setText(ustr); 3655 int i; 3656 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3657 if (expectedcount >= EXPECTEDSIZE) { 3658 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3659 return; 3660 } 3661 expected[expectedcount ++] = i; 3662 } 3663 3664 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3665 } 3666 delete bi; 3667 #endif 3668 } 3669 3670 void RBBITest::TestSentBreaks(void) 3671 { 3672 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3673 Locale locale("en"); 3674 UErrorCode status = U_ZERO_ERROR; 3675 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3676 UChar str[200]; 3677 static const char *strlist[] = 3678 { 3679 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3680 "This\n", 3681 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3682 "\"Sentence ending with a quote.\" Bye.", 3683 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3684 "Hi! 
This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3685 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3686 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3687 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3688 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3689 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3690 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3691 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3692 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3693 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3694 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3695 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3696 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3697 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3698 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3699 }; 3700 int loop; 3701 if (U_FAILURE(status)) { 3702 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3703 return; 3704 } 3705 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3706 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3707 UnicodeString ustr(str); 3708 3709 RBBISentMonkey monkey; 3710 if (U_FAILURE(monkey.deferredStatus)) { 3711 continue; 3712 } 3713 3714 const int EXPECTEDSIZE = 50; 3715 int expected[EXPECTEDSIZE]; 3716 int expectedcount = 0; 3717 3718 monkey.setText(ustr); 3719 int i; 3720 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3721 if (expectedcount >= EXPECTEDSIZE) { 3722 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3723 return; 3724 
} 3725 expected[expectedcount ++] = i; 3726 } 3727 3728 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3729 } 3730 delete bi; 3731 #endif 3732 } 3733 3734 void RBBITest::TestMonkey(char *params) { 3735 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3736 3737 UErrorCode status = U_ZERO_ERROR; 3738 int32_t loopCount = 500; 3739 int32_t seed = 1; 3740 UnicodeString breakType = "all"; 3741 Locale locale("en"); 3742 UBool useUText = FALSE; 3743 3744 if (quick == FALSE) { 3745 loopCount = 10000; 3746 } 3747 3748 if (params) { 3749 UnicodeString p(params); 3750 loopCount = getIntParam("loop", p, loopCount); 3751 seed = getIntParam("seed", p, seed); 3752 3753 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 3754 if (m.find()) { 3755 breakType = m.group(1, status); 3756 m.reset(); 3757 p = m.replaceFirst("", status); 3758 } 3759 3760 RegexMatcher u(" *utext", p, 0, status); 3761 if (u.find()) { 3762 useUText = TRUE; 3763 u.reset(); 3764 p = u.replaceFirst("", status); 3765 } 3766 3767 3768 // m.reset(p); 3769 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 3770 // Each option is stripped out of the option string as it is processed. 3771 // All options have been checked. The option string should have been completely emptied.. 
3772 char buf[100]; 3773 p.extract(buf, sizeof(buf), NULL, status); 3774 buf[sizeof(buf)-1] = 0; 3775 errln("Unrecognized or extra parameter: %s\n", buf); 3776 return; 3777 } 3778 3779 } 3780 3781 if (breakType == "char" || breakType == "all") { 3782 RBBICharMonkey m; 3783 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3784 if (U_SUCCESS(status)) { 3785 RunMonkey(bi, m, "char", seed, loopCount, useUText); 3786 if (breakType == "all" && useUText==FALSE) { 3787 // Also run a quick test with UText when "all" is specified 3788 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 3789 } 3790 } 3791 else { 3792 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 3793 } 3794 delete bi; 3795 } 3796 3797 if (breakType == "word" || breakType == "all") { 3798 logln("Word Break Monkey Test"); 3799 RBBIWordMonkey m; 3800 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3801 if (U_SUCCESS(status)) { 3802 RunMonkey(bi, m, "word", seed, loopCount, useUText); 3803 } 3804 else { 3805 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 3806 } 3807 delete bi; 3808 } 3809 3810 if (breakType == "line" || breakType == "all") { 3811 logln("Line Break Monkey Test"); 3812 RBBILineMonkey m; 3813 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3814 if (loopCount >= 10) { 3815 loopCount = loopCount / 5; // Line break runs slower than the others. 
3816 } 3817 if (U_SUCCESS(status)) { 3818 RunMonkey(bi, m, "line", seed, loopCount, useUText); 3819 } 3820 else { 3821 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3822 } 3823 delete bi; 3824 } 3825 3826 if (breakType == "sent" || breakType == "all" ) { 3827 logln("Sentence Break Monkey Test"); 3828 RBBISentMonkey m; 3829 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3830 if (loopCount >= 10) { 3831 loopCount = loopCount / 10; // Sentence runs slower than the other break types 3832 } 3833 if (U_SUCCESS(status)) { 3834 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 3835 } 3836 else { 3837 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3838 } 3839 delete bi; 3840 } 3841 3842 #endif 3843 } 3844 3845 // 3846 // Run a RBBI monkey test. Common routine, for all break iterator types. 3847 // Parameters: 3848 // bi - the break iterator to use 3849 // mk - MonkeyKind, abstraction for obtaining expected results 3850 // name - Name of test (char, word, etc.) 
for use in error messages 3851 // seed - Seed for starting random number generator (parameter from user) 3852 // numIterations 3853 // 3854 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 3855 int32_t numIterations, UBool useUText) { 3856 3857 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3858 3859 const int32_t TESTSTRINGLEN = 500; 3860 UnicodeString testText; 3861 int32_t numCharClasses; 3862 UVector *chClasses; 3863 int expected[TESTSTRINGLEN*2 + 1]; 3864 int expectedCount = 0; 3865 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 3866 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 3867 char reverseBreaks[TESTSTRINGLEN*2+1]; 3868 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 3869 char followingBreaks[TESTSTRINGLEN*2+1]; 3870 char precedingBreaks[TESTSTRINGLEN*2+1]; 3871 int i; 3872 int loopCount = 0; 3873 3874 m_seed = seed; 3875 3876 numCharClasses = mk.charClasses()->size(); 3877 chClasses = mk.charClasses(); 3878 3879 // Check for errors that occured during the construction of the MonkeyKind object. 3880 // Can't report them where they occured because errln() is a method coming from intlTest, 3881 // and is not visible outside of RBBITest :-( 3882 if (U_FAILURE(mk.deferredStatus)) { 3883 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 3884 return; 3885 } 3886 3887 // Verify that the character classes all have at least one member. 3888 for (i=0; i<numCharClasses; i++) { 3889 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 3890 if (s == NULL || s->size() == 0) { 3891 errln("Character Class #%d is null or of zero size.", i); 3892 return; 3893 } 3894 } 3895 3896 while (loopCount < numIterations || numIterations == -1) { 3897 if (numIterations == -1 && loopCount % 10 == 0) { 3898 // If test is running in an infinite loop, display a periodic tic so 3899 // we can tell that it is making progress. 
3900 fprintf(stderr, "."); 3901 } 3902 // Save current random number seed, so that we can recreate the random numbers 3903 // for this loop iteration in event of an error. 3904 seed = m_seed; 3905 3906 // Populate a test string with data. 3907 testText.truncate(0); 3908 for (i=0; i<TESTSTRINGLEN; i++) { 3909 int32_t aClassNum = m_rand() % numCharClasses; 3910 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 3911 int32_t charIdx = m_rand() % classSet->size(); 3912 UChar32 c = classSet->charAt(charIdx); 3913 if (c < 0) { // TODO: deal with sets containing strings. 3914 errln("c < 0"); 3915 break; 3916 } 3917 testText.append(c); 3918 } 3919 3920 // Calculate the expected results for this test string. 3921 mk.setText(testText); 3922 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 3923 expectedBreaks[0] = 1; 3924 int32_t breakPos = 0; 3925 expectedCount = 0; 3926 for (;;) { 3927 breakPos = mk.next(breakPos); 3928 if (breakPos == -1) { 3929 break; 3930 } 3931 if (breakPos > testText.length()) { 3932 errln("breakPos > testText.length()"); 3933 } 3934 expectedBreaks[breakPos] = 1; 3935 U_ASSERT(expectedCount<testText.length()); 3936 expected[expectedCount ++] = breakPos; 3937 } 3938 3939 // Find the break positions using forward iteration 3940 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 3941 if (useUText) { 3942 UErrorCode status = U_ZERO_ERROR; 3943 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 3944 // testUText = utext_openUnicodeString(testUText, &testText, &status); 3945 bi->setText(testUText, status); 3946 TEST_ASSERT_SUCCESS(status); 3947 utext_close(testUText); // The break iterator does a shallow clone of the UText 3948 // This UText can be closed immediately, so long as the 3949 // testText string continues to exist. 
3950 } else { 3951 bi->setText(testText); 3952 } 3953 3954 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 3955 if (i < 0 || i > testText.length()) { 3956 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 3957 break; 3958 } 3959 forwardBreaks[i] = 1; 3960 } 3961 3962 // Find the break positions using reverse iteration 3963 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 3964 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 3965 if (i < 0 || i > testText.length()) { 3966 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 3967 break; 3968 } 3969 reverseBreaks[i] = 1; 3970 } 3971 3972 // Find the break positions using isBoundary() tests. 3973 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 3974 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 3975 for (i=0; i<=testText.length(); i++) { 3976 isBoundaryBreaks[i] = bi->isBoundary(i); 3977 } 3978 3979 3980 // Find the break positions using the following() function. 3981 // printf("."); 3982 memset(followingBreaks, 0, sizeof(followingBreaks)); 3983 int32_t lastBreakPos = 0; 3984 followingBreaks[0] = 1; 3985 for (i=0; i<testText.length(); i++) { 3986 breakPos = bi->following(i); 3987 if (breakPos <= i || 3988 breakPos < lastBreakPos || 3989 breakPos > testText.length() || 3990 (breakPos > lastBreakPos && lastBreakPos > i)) { 3991 errln("%s break monkey test: " 3992 "Out of range value returned by BreakIterator::following().\n" 3993 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 3994 name, seed, i, breakPos, lastBreakPos); 3995 break; 3996 } 3997 followingBreaks[breakPos] = 1; 3998 lastBreakPos = breakPos; 3999 } 4000 4001 // Find the break positions using the preceding() function. 
4002 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4003 lastBreakPos = testText.length(); 4004 precedingBreaks[testText.length()] = 1; 4005 for (i=testText.length(); i>0; i--) { 4006 breakPos = bi->preceding(i); 4007 if (breakPos >= i || 4008 breakPos > lastBreakPos || 4009 (breakPos < 0 && testText.getChar32Start(i)>0) || 4010 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4011 errln("%s break monkey test: " 4012 "Out of range value returned by BreakIterator::preceding().\n" 4013 "index=%d; prev returned %d; lastBreak=%d" , 4014 name, i, breakPos, lastBreakPos); 4015 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4016 precedingBreaks[i] = 2; // Forces an error. 4017 } 4018 } else { 4019 if (breakPos >= 0) { 4020 precedingBreaks[breakPos] = 1; 4021 } 4022 lastBreakPos = breakPos; 4023 } 4024 } 4025 4026 // Compare the expected and actual results. 4027 for (i=0; i<=testText.length(); i++) { 4028 const char *errorType = NULL; 4029 if (forwardBreaks[i] != expectedBreaks[i]) { 4030 errorType = "next()"; 4031 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4032 errorType = "previous()"; 4033 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4034 errorType = "isBoundary()"; 4035 } else if (followingBreaks[i] != expectedBreaks[i]) { 4036 errorType = "following()"; 4037 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4038 errorType = "preceding()"; 4039 } 4040 4041 4042 if (errorType != NULL) { 4043 // Format a range of the test text that includes the failure as 4044 // a data item that can be included in the rbbi test data file. 4045 4046 // Start of the range is the last point where expected and actual results 4047 // both agreed that there was a break position. 
4048 int startContext = i; 4049 int32_t count = 0; 4050 for (;;) { 4051 if (startContext==0) { break; } 4052 startContext --; 4053 if (expectedBreaks[startContext] != 0) { 4054 if (count == 2) break; 4055 count ++; 4056 } 4057 } 4058 4059 // End of range is two expected breaks past the start position. 4060 int endContext = i + 1; 4061 int ci; 4062 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4063 for (;;) { 4064 if (endContext >= testText.length()) {break;} 4065 if (expectedBreaks[endContext-1] != 0) { 4066 if (count == 0) break; 4067 count --; 4068 } 4069 endContext ++; 4070 } 4071 } 4072 4073 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4074 UnicodeString errorText = "<data>"; 4075 /***if (strcmp(errorType, "next()") == 0) { 4076 startContext = 0; 4077 endContext = testText.length(); 4078 4079 printStringBreaks(testText, expected, expectedCount); 4080 }***/ 4081 4082 for (ci=startContext; ci<endContext;) { 4083 UnicodeString hexChars("0123456789abcdef"); 4084 UChar32 c; 4085 int bn; 4086 c = testText.char32At(ci); 4087 if (ci == i) { 4088 // This is the location of the error. 4089 errorText.append("<?>"); 4090 } else if (expectedBreaks[ci] != 0) { 4091 // This a non-error expected break position. 
4092 errorText.append("\\"); 4093 } 4094 if (c < 0x10000) { 4095 errorText.append("\\u"); 4096 for (bn=12; bn>=0; bn-=4) { 4097 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4098 } 4099 } else { 4100 errorText.append("\\U"); 4101 for (bn=28; bn>=0; bn-=4) { 4102 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4103 } 4104 } 4105 ci = testText.moveIndex32(ci, 1); 4106 } 4107 errorText.append("\\"); 4108 errorText.append("</data>\n"); 4109 4110 // Output the error 4111 char charErrorTxt[500]; 4112 UErrorCode status = U_ZERO_ERROR; 4113 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4114 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4115 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4116 4117 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4118 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4119 errorType, seed, i, charErrorTxt); 4120 break; 4121 } 4122 } 4123 4124 loopCount++; 4125 } 4126 #endif 4127 } 4128 4129 4130 // Bug 5532. UTF-8 based UText fails in dictionary code. 4131 // This test checks the initial patch, 4132 // which is to just keep it from crashing. Correct word boundaries 4133 // await a proper fix to the dictionary code. 4134 // 4135 void RBBITest::TestBug5532(void) { 4136 // Text includes a mixture of Thai and Latin. 
4137 const unsigned char utf8Data[] = { 4138 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4139 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4140 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4141 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4142 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4143 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4144 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4145 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4146 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4147 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4148 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4149 4150 UErrorCode status = U_ZERO_ERROR; 4151 UText utext=UTEXT_INITIALIZER; 4152 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4153 TEST_ASSERT_SUCCESS(status); 4154 4155 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4156 TEST_ASSERT_SUCCESS(status); 4157 if (U_SUCCESS(status)) { 4158 bi->setText(&utext, status); 4159 TEST_ASSERT_SUCCESS(status); 4160 4161 int32_t breakCount = 0; 4162 int32_t previousBreak = -1; 4163 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4164 // For now, just make sure that the break iterator doesn't hang. 4165 TEST_ASSERT(previousBreak < bi->current()); 4166 previousBreak = bi->current(); 4167 } 4168 TEST_ASSERT(breakCount > 0); 4169 } 4170 delete bi; 4171 utext_close(&utext); 4172 } 4173 4174 4175 // 4176 // TestDebug - A place-holder test for debugging purposes. 4177 // For putting in fragments of other tests that can be invoked 4178 // for tracing without a lot of unwanted extra stuff happening. 
4179 // 4180 void RBBITest::TestDebug(void) { 4181 #if 0 4182 UErrorCode status = U_ZERO_ERROR; 4183 int pos = 0; 4184 int ruleStatus = 0; 4185 4186 RuleBasedBreakIterator* bi = 4187 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4188 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4189 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4190 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4191 // UnicodeString s("Aaa. Bcd"); 4192 s = s.unescape(); 4193 bi->setText(s); 4194 UBool r = bi->isBoundary(8); 4195 printf("%s", r?"true":"false"); 4196 return; 4197 pos = bi->last(); 4198 do { 4199 // ruleStatus = bi->getRuleStatus(); 4200 printf("%d\t%d\n", pos, ruleStatus); 4201 pos = bi->previous(); 4202 } while (pos != BreakIterator::DONE); 4203 #endif 4204 } 4205 4206 void RBBITest::TestProperties() { 4207 UErrorCode errorCode = U_ZERO_ERROR; 4208 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4209 if (!prependSet.isEmpty()) { 4210 errln( 4211 "[:GCB=Prepend:] is not empty any more. " 4212 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4213 "change this test to the opposite condition."); 4214 } 4215 } 4216 4217 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4218