1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /******************************************************************** 4 * COPYRIGHT: 5 * Copyright (c) 1999-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ********************************************************************/ 8 /************************************************************************ 9 * Date Name Description 10 * 12/15/99 Madhu Creation. 11 * 01/12/2000 Madhu Updated for changed API and added new tests 12 ************************************************************************/ 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_BREAK_ITERATION 16 17 #include <stdio.h> 18 #include <stdlib.h> 19 #include <string.h> 20 #include <utility> 21 #include <vector> 22 23 #include "unicode/brkiter.h" 24 #include "unicode/localpointer.h" 25 #include "unicode/numfmt.h" 26 #include "unicode/rbbi.h" 27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 28 #include "unicode/regex.h" 29 #endif 30 #include "unicode/schriter.h" 31 #include "unicode/uchar.h" 32 #include "unicode/utf16.h" 33 #include "unicode/ucnv.h" 34 #include "unicode/uniset.h" 35 #include "unicode/uscript.h" 36 #include "unicode/ustring.h" 37 #include "unicode/utext.h" 38 39 #include "charstr.h" 40 #include "cmemory.h" 41 #include "cstr.h" 42 #include "intltest.h" 43 #include "rbbitst.h" 44 #include "rbbidata.h" 45 #include "utypeinfo.h" // for 'typeid' to work 46 #include "uvector.h" 47 #include "uvectr32.h" 48 49 50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 51 #include "unicode/filteredbrk.h" 52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION 53 54 #define TEST_ASSERT(x) {if (!(x)) { \ 55 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 56 57 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 58 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 59 60 
//--------------------------------------------- 61 // runIndexedTest 62 //--------------------------------------------- 63 64 65 // Note: Before adding new tests to this file, check whether the desired test data can 66 // simply be added to the file testdata/rbbitest.txt. In most cases it can, 67 // it's much less work than writing a new test, diagnostic output in the event of failures 68 // is good, and the test data file will is shared with ICU4J, so eventually the test 69 // will run there as well, without additional effort. 70 71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 72 { 73 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 74 fTestParams = params; 75 76 TESTCASE_AUTO_BEGIN; 77 #if !UCONFIG_NO_FILE_IO 78 TESTCASE_AUTO(TestBug4153072); 79 #endif 80 #if !UCONFIG_NO_FILE_IO 81 TESTCASE_AUTO(TestUnicodeFiles); 82 #endif 83 TESTCASE_AUTO(TestGetAvailableLocales); 84 TESTCASE_AUTO(TestGetDisplayName); 85 #if !UCONFIG_NO_FILE_IO 86 TESTCASE_AUTO(TestEndBehaviour); 87 TESTCASE_AUTO(TestWordBreaks); 88 TESTCASE_AUTO(TestWordBoundary); 89 TESTCASE_AUTO(TestLineBreaks); 90 TESTCASE_AUTO(TestSentBreaks); 91 TESTCASE_AUTO(TestExtended); 92 #endif 93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 94 TESTCASE_AUTO(TestMonkey); 95 #endif 96 #if !UCONFIG_NO_FILE_IO 97 TESTCASE_AUTO(TestBug3818); 98 #endif 99 TESTCASE_AUTO(TestDebug); 100 #if !UCONFIG_NO_FILE_IO 101 TESTCASE_AUTO(TestBug5775); 102 #endif 103 TESTCASE_AUTO(TestBug9983); 104 TESTCASE_AUTO(TestDictRules); 105 TESTCASE_AUTO(TestBug5532); 106 TESTCASE_AUTO(TestBug7547); 107 TESTCASE_AUTO(TestBug12797); 108 TESTCASE_AUTO(TestBug12918); 109 TESTCASE_AUTO(TestBug12932); 110 TESTCASE_AUTO(TestEmoji); 111 TESTCASE_AUTO(TestBug12519); 112 TESTCASE_AUTO(TestBug12677); 113 TESTCASE_AUTO(TestTableRedundancies); 114 TESTCASE_AUTO(TestBug13447); 115 TESTCASE_AUTO(TestReverse); 116 TESTCASE_AUTO(TestBug13692); 117 TESTCASE_AUTO_END; 118 } 119 120 121 
//-------------------------------------------------------------------------------------- 122 // 123 // RBBITest constructor and destructor 124 // 125 //-------------------------------------------------------------------------------------- 126 127 RBBITest::RBBITest() { 128 fTestParams = NULL; 129 } 130 131 132 RBBITest::~RBBITest() { 133 } 134 135 136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 137 UErrorCode status = U_ZERO_ERROR; 138 char name[100]; 139 printf("code alpha extend alphanum type word sent line name\n"); 140 int nextExpectedIndex = 0; 141 utext_setNativeIndex(tstr, 0); 142 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 143 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 144 printf("------------------------------------------------ %d\n", j); 145 ++nextExpectedIndex; 146 } 147 148 UChar32 c = utext_next32(tstr); 149 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, 151 u_isUAlphabetic(c), 152 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 153 u_isalnum(c), 154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 155 u_charType(c), 156 U_SHORT_PROPERTY_NAME), 157 u_getPropertyValueName(UCHAR_WORD_BREAK, 158 u_getIntPropertyValue(c, 159 UCHAR_WORD_BREAK), 160 U_SHORT_PROPERTY_NAME), 161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 162 u_getIntPropertyValue(c, 163 UCHAR_SENTENCE_BREAK), 164 U_SHORT_PROPERTY_NAME), 165 u_getPropertyValueName(UCHAR_LINE_BREAK, 166 u_getIntPropertyValue(c, 167 UCHAR_LINE_BREAK), 168 U_SHORT_PROPERTY_NAME), 169 name); 170 } 171 } 172 173 174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 175 UErrorCode status = U_ZERO_ERROR; 176 UText *tstr = NULL; 177 tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 178 if (U_FAILURE(status)) { 179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", 
u_errorName(status)); 180 return; 181 } 182 printStringBreaks(tstr, expected, expectedCount); 183 utext_close(tstr); 184 } 185 186 187 void RBBITest::TestBug3818() { 188 UErrorCode status = U_ZERO_ERROR; 189 190 // Four Thai words... 191 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 193 UnicodeString thaiStr(thaiWordData); 194 195 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 196 if (U_FAILURE(status) || bi == NULL) { 197 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 198 return; 199 } 200 bi->setText(thaiStr); 201 202 int32_t startOfSecondWord = bi->following(1); 203 if (startOfSecondWord != 4) { 204 errln("Fail at file %s, line %d expected start of word at 4, got %d", 205 __FILE__, __LINE__, startOfSecondWord); 206 } 207 startOfSecondWord = bi->following(0); 208 if (startOfSecondWord != 4) { 209 errln("Fail at file %s, line %d expected start of word at 4, got %d", 210 __FILE__, __LINE__, startOfSecondWord); 211 } 212 delete bi; 213 } 214 215 216 //--------------------------------------------- 217 // 218 // other tests 219 // 220 //--------------------------------------------- 221 222 void RBBITest::TestGetAvailableLocales() 223 { 224 int32_t locCount = 0; 225 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 226 227 if (locCount == 0) 228 dataerrln("getAvailableLocales() returned an empty list!"); 229 // Just make sure that it's returning good memory. 
230 int32_t i; 231 for (i = 0; i < locCount; ++i) { 232 logln(locList[i].getName()); 233 } 234 } 235 236 //Testing the BreakIterator::getDisplayName() function 237 void RBBITest::TestGetDisplayName() 238 { 239 UnicodeString result; 240 241 BreakIterator::getDisplayName(Locale::getUS(), result); 242 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 244 + result); 245 246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 247 if (result != "French (France)") 248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 249 + result); 250 } 251 /** 252 * Test End Behaviour 253 * @bug 4068137 254 */ 255 void RBBITest::TestEndBehaviour() 256 { 257 UErrorCode status = U_ZERO_ERROR; 258 UnicodeString testString("boo."); 259 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 260 if (U_FAILURE(status)) 261 { 262 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 263 return; 264 } 265 wb->setText(testString); 266 267 if (wb->first() != 0) 268 errln("Didn't get break at beginning of string."); 269 if (wb->next() != 3) 270 errln("Didn't get break before period in \"boo.\""); 271 if (wb->current() != 4 && wb->next() != 4) 272 errln("Didn't get break at end of string."); 273 delete wb; 274 } 275 /* 276 * @bug 4153072 277 */ 278 void RBBITest::TestBug4153072() { 279 UErrorCode status = U_ZERO_ERROR; 280 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 281 if (U_FAILURE(status)) 282 { 283 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 284 return; 285 } 286 UnicodeString str("...Hello, World!..."); 287 int32_t begin = 3; 288 int32_t end = str.length() - 3; 289 UBool onBoundary; 290 291 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 292 iter->adoptText(textIterator); 293 int index; 294 // Note: with the switch to UText, there is no way to restrict the 295 // iteration range to begin at an index other than zero. 296 // String character iterators created with a non-zero bound are 297 // treated by RBBI as being empty. 298 for (index = -1; index < begin + 1; ++index) { 299 onBoundary = iter->isBoundary(index); 300 if (index == 0? !onBoundary : onBoundary) { 301 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 302 " and begin index = " + begin); 303 } 304 } 305 delete iter; 306 } 307 308 309 // 310 // Test for problem reported by Ashok Matoria on 9 July 2007 311 // One.<kSoftHyphen><kSpace>Two. 312 // 313 // Sentence break at start (0) and then on calling next() it breaks at 314 // 'T' of "Two". Now, at this point if I do next() and 315 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
316 // 317 void RBBITest::TestBug5775() { 318 UErrorCode status = U_ZERO_ERROR; 319 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 320 TEST_ASSERT_SUCCESS(status); 321 if (U_FAILURE(status)) { 322 return; 323 } 324 // Check for status first for better handling of no data errors. 325 TEST_ASSERT(bi != NULL); 326 if (bi == NULL) { 327 return; 328 } 329 330 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 331 // 01234 56789 332 s = s.unescape(); 333 bi->setText(s); 334 int pos = bi->next(); 335 TEST_ASSERT(pos == 6); 336 pos = bi->next(); 337 TEST_ASSERT(pos == 10); 338 pos = bi->previous(); 339 TEST_ASSERT(pos == 6); 340 delete bi; 341 } 342 343 344 345 //------------------------------------------------------------------------------ 346 // 347 // RBBITest::Extended Run RBBI Tests from an external test data file 348 // 349 //------------------------------------------------------------------------------ 350 351 struct TestParams { 352 BreakIterator *bi; // Break iterator is set while parsing test source. 353 // Changed out whenever test data changes break type. 354 355 UnicodeString dataToBreak; // Data that is built up while parsing the test. 356 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 357 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 358 UVector32 *srcCol; 359 360 UText *textToBreak; // UText, could be UTF8 or UTF16. 361 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 362 CharString utf8String; // UTF-8 form of text to break. 
363 364 TestParams(UErrorCode &status) : dataToBreak() { 365 bi = NULL; 366 expectedBreaks = new UVector32(status); 367 srcLine = new UVector32(status); 368 srcCol = new UVector32(status); 369 textToBreak = NULL; 370 textMap = new UVector32(status); 371 } 372 373 ~TestParams() { 374 delete bi; 375 delete expectedBreaks; 376 delete srcLine; 377 delete srcCol; 378 utext_close(textToBreak); 379 delete textMap; 380 } 381 382 int32_t getSrcLine(int32_t bp); 383 int32_t getExpectedBreak(int32_t bp); 384 int32_t getSrcCol(int32_t bp); 385 386 void setUTF16(UErrorCode &status); 387 void setUTF8(UErrorCode &status); 388 }; 389 390 // Append a UnicodeString to a CharString with UTF-8 encoding. 391 // Substitute any invalid chars. 392 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 394 if (U_FAILURE(status)) { 395 return; 396 } 397 int32_t utf8Length; 398 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 399 src.getBuffer(), src.length(), // UTF-16 data 400 0xfffd, NULL, // Substitution char, number of subs. 
401 &status); 402 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 403 return; 404 } 405 status = U_ZERO_ERROR; 406 int32_t capacity; 407 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 408 u_strToUTF8WithSub(buffer, utf8Length, NULL, 409 src.getBuffer(), src.length(), 410 0xfffd, NULL, &status); 411 dest.append(buffer, utf8Length, status); 412 } 413 414 415 void TestParams::setUTF16(UErrorCode &status) { 416 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 417 textMap->removeAllElements(); 418 for (int32_t i=0; i<dataToBreak.length(); i++) { 419 if (i == dataToBreak.getChar32Start(i)) { 420 textMap->addElement(i, status); 421 } else { 422 textMap->addElement(-1, status); 423 } 424 } 425 textMap->addElement(dataToBreak.length(), status); 426 U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 427 } 428 429 430 void TestParams::setUTF8(UErrorCode &status) { 431 if (U_FAILURE(status)) { 432 return; 433 } 434 utf8String.clear(); 435 CharStringAppend(utf8String, dataToBreak, status); 436 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 437 if (U_FAILURE(status)) { 438 return; 439 } 440 441 textMap->removeAllElements(); 442 int32_t utf16Index = 0; 443 for (;;) { 444 textMap->addElement(utf16Index, status); 445 UChar32 c32 = utext_current32(textToBreak); 446 if (c32 < 0) { 447 break; 448 } 449 utf16Index += U16_LENGTH(c32); 450 utext_next32(textToBreak); 451 while (textMap->size() < utext_getNativeIndex(textToBreak)) { 452 textMap->addElement(-1, status); 453 } 454 } 455 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 456 } 457 458 459 int32_t TestParams::getSrcLine(int32_t bp) { 460 if (bp >= textMap->size()) { 461 bp = textMap->size() - 1; 462 } 463 int32_t i = 0; 464 for(; bp >= 0 ; --bp) { 465 // Move to a character boundary if we are not on one already. 
466 i = textMap->elementAti(bp); 467 if (i >= 0) { 468 break; 469 } 470 } 471 return srcLine->elementAti(i); 472 } 473 474 475 int32_t TestParams::getExpectedBreak(int32_t bp) { 476 if (bp >= textMap->size()) { 477 return 0; 478 } 479 int32_t i = textMap->elementAti(bp); 480 int32_t retVal = 0; 481 if (i >= 0) { 482 retVal = expectedBreaks->elementAti(i); 483 } 484 return retVal; 485 } 486 487 488 int32_t TestParams::getSrcCol(int32_t bp) { 489 if (bp >= textMap->size()) { 490 bp = textMap->size() - 1; 491 } 492 int32_t i = 0; 493 for(; bp >= 0; --bp) { 494 // Move bp to a character boundary if we are not on one already. 495 i = textMap->elementAti(bp); 496 if (i >= 0) { 497 break; 498 } 499 } 500 return srcCol->elementAti(i); 501 } 502 503 504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) { 505 int32_t bp; 506 int32_t prevBP; 507 int32_t i; 508 509 TEST_ASSERT_SUCCESS(status); 510 if (U_FAILURE(status)) { 511 return; 512 } 513 514 if (t->bi == NULL) { 515 return; 516 } 517 518 t->bi->setText(t->textToBreak, status); 519 // 520 // Run the iterator forward 521 // 522 prevBP = -1; 523 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 524 if (prevBP == bp) { 525 // Fail for lack of forward progress. 526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 527 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 528 break; 529 } 530 531 // Check that there we didn't miss an expected break between the last one 532 // and this one. 533 for (i=prevBP+1; i<bp; i++) { 534 if (t->getExpectedBreak(i) != 0) { 535 int expected[] = {0, i}; 536 printStringBreaks(t->dataToBreak, expected, 2); 537 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 538 i, t->getSrcLine(i), t->getSrcCol(i)); 539 } 540 } 541 542 // Check that the break we did find was expected 543 if (t->getExpectedBreak(bp) == 0) { 544 int expected[] = {0, bp}; 545 printStringBreaks(t->textToBreak, expected, 2); 546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 547 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 548 } else { 549 // The break was expected. 550 // Check that the {nnn} tag value is correct. 551 int32_t expectedTagVal = t->getExpectedBreak(bp); 552 if (expectedTagVal == -1) { 553 expectedTagVal = 0; 554 } 555 int32_t line = t->getSrcLine(bp); 556 int32_t rs = t->bi->getRuleStatus(); 557 if (rs != expectedTagVal) { 558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 559 " Actual, Expected status = %4d, %4d", 560 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 561 } 562 } 563 564 prevBP = bp; 565 } 566 567 // Verify that there were no missed expected breaks after the last one found 568 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 569 if (t->getExpectedBreak(i) != 0) { 570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 571 i, t->getSrcLine(i), t->getSrcCol(i)); 572 } 573 } 574 575 // 576 // Run the iterator backwards, verify that the same breaks are found. 577 // 578 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 579 bp = t->bi->last(); 580 while (bp != BreakIterator::DONE) { 581 if (prevBP == bp) { 582 // Fail for lack of progress. 583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 584 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 585 break; 586 } 587 588 // Check that we didn't miss an expected break between the last one 589 // and this one. (UVector returns zeros for index out of bounds.) 
590 for (i=prevBP-1; i>bp; i--) { 591 if (t->getExpectedBreak(i) != 0) { 592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 593 i, t->getSrcLine(i), t->getSrcCol(i)); 594 } 595 } 596 597 // Check that the break we did find was expected 598 if (t->getExpectedBreak(bp) == 0) { 599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 600 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 601 } else { 602 // The break was expected. 603 // Check that the {nnn} tag value is correct. 604 int32_t expectedTagVal = t->getExpectedBreak(bp); 605 if (expectedTagVal == -1) { 606 expectedTagVal = 0; 607 } 608 int line = t->getSrcLine(bp); 609 int32_t rs = t->bi->getRuleStatus(); 610 if (rs != expectedTagVal) { 611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 612 " Actual, Expected status = %4d, %4d", 613 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 614 } 615 } 616 617 prevBP = bp; 618 bp = t->bi->previous(); 619 } 620 621 // Verify that there were no missed breaks prior to the last one found 622 for (i=prevBP-1; i>=0; i--) { 623 if (t->getExpectedBreak(i) != 0) { 624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 625 i, t->getSrcLine(i), t->getSrcCol(i)); 626 } 627 } 628 629 // Check isBoundary() 630 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 631 UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 632 UBool boundaryFound = t->bi->isBoundary(i); 633 if (boundaryExpected != boundaryFound) { 634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 635 " Expected, Actual= %s, %s", 636 i, t->getSrcLine(i), t->getSrcCol(i), 637 boundaryExpected ? "true":"false", boundaryFound? 
"true" : "false"); 638 } 639 } 640 641 // Check following() 642 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 643 int32_t actualBreak = t->bi->following(i); 644 int32_t expectedBreak = BreakIterator::DONE; 645 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 646 if (t->getExpectedBreak(j) != 0) { 647 expectedBreak = j; 648 break; 649 } 650 } 651 if (expectedBreak != actualBreak) { 652 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 653 " Expected, Actual= %d, %d", 654 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 655 } 656 } 657 658 // Check preceding() 659 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 660 int32_t actualBreak = t->bi->preceding(i); 661 int32_t expectedBreak = BreakIterator::DONE; 662 663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 664 // preceding(trailing byte) will return the index of some preceding code point, 665 // not the lead byte of the current code point, even though that has a smaller index. 666 // Therefore, start looking at the expected break data not at i-1, but at 667 // the start of code point index - 1. 668 utext_setNativeIndex(t->textToBreak, i); 669 int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 670 for (; j >= 0; j--) { 671 if (t->getExpectedBreak(j) != 0) { 672 expectedBreak = j; 673 break; 674 } 675 } 676 if (expectedBreak != actualBreak) { 677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 678 " Expected, Actual= %d, %d", 679 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 680 } 681 } 682 } 683 684 685 void RBBITest::TestExtended() { 686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This 687 // data driven test closely entangles filtered and regular data. 
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION 689 UErrorCode status = U_ZERO_ERROR; 690 Locale locale(""); 691 692 TestParams tp(status); 693 694 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status); 695 if (U_FAILURE(status)) { 696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 697 } 698 699 // 700 // Open and read the test data file. 701 // 702 const char *testDataDirectory = IntlTest::getSourceTestData(status); 703 CharString testFileName(testDataDirectory, -1, status); 704 testFileName.append("rbbitst.txt", -1, status); 705 706 int len; 707 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); 708 if (U_FAILURE(status)) { 709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status)); 710 return; 711 } 712 713 bool skipTest = false; // Skip this test? 714 715 // 716 // Put the test data into a UnicodeString 717 // 718 UnicodeString testString(FALSE, testFile, len); 719 720 enum EParseState{ 721 PARSE_COMMENT, 722 PARSE_TAG, 723 PARSE_DATA, 724 PARSE_NUM, 725 PARSE_RULES 726 } 727 parseState = PARSE_TAG; 728 729 EParseState savedState = PARSE_TAG; 730 731 int32_t lineNum = 1; 732 int32_t colStart = 0; 733 int32_t column = 0; 734 int32_t charIdx = 0; 735 736 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 737 738 UnicodeString rules; // Holds rules from a <rules> ... 
</rules> block 739 int32_t rulesFirstLine; // Line number of the start of current <rules> block 740 741 for (charIdx = 0; charIdx < len; ) { 742 status = U_ZERO_ERROR; 743 UChar c = testString.charAt(charIdx); 744 charIdx++; 745 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') { 746 // treat CRLF as a unit 747 c = u'\n'; 748 charIdx++; 749 } 750 if (c == u'\n' || c == u'\r') { 751 lineNum++; 752 colStart = charIdx; 753 } 754 column = charIdx - colStart + 1; 755 756 switch (parseState) { 757 case PARSE_COMMENT: 758 if (c == u'\n' || c == u'\r') { 759 parseState = savedState; 760 } 761 break; 762 763 case PARSE_TAG: 764 { 765 if (c == u'#') { 766 parseState = PARSE_COMMENT; 767 savedState = PARSE_TAG; 768 break; 769 } 770 if (u_isUWhiteSpace(c)) { 771 break; 772 } 773 if (testString.compare(charIdx-1, 6, u"<word>") == 0) { 774 delete tp.bi; 775 tp.bi = BreakIterator::createWordInstance(locale, status); 776 skipTest = false; 777 charIdx += 5; 778 break; 779 } 780 if (testString.compare(charIdx-1, 6, u"<char>") == 0) { 781 delete tp.bi; 782 tp.bi = BreakIterator::createCharacterInstance(locale, status); 783 skipTest = false; 784 charIdx += 5; 785 break; 786 } 787 if (testString.compare(charIdx-1, 6, u"<line>") == 0) { 788 delete tp.bi; 789 tp.bi = BreakIterator::createLineInstance(locale, status); 790 skipTest = false; 791 charIdx += 5; 792 break; 793 } 794 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) { 795 delete tp.bi; 796 tp.bi = BreakIterator::createSentenceInstance(locale, status); 797 skipTest = false; 798 charIdx += 5; 799 break; 800 } 801 if (testString.compare(charIdx-1, 7, u"<title>") == 0) { 802 delete tp.bi; 803 tp.bi = BreakIterator::createTitleInstance(locale, status); 804 charIdx += 6; 805 break; 806 } 807 808 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 || 809 testString.compare(charIdx-1, 10, u"<badrules>") == 0) { 810 charIdx = testString.indexOf(u'>', charIdx) + 1; 811 parseState = PARSE_RULES; 812 
rules.remove(); 813 rulesFirstLine = lineNum; 814 break; 815 } 816 817 // <locale loc_name> 818 localeMatcher.reset(testString); 819 if (localeMatcher.lookingAt(charIdx-1, status)) { 820 UnicodeString localeName = localeMatcher.group(1, status); 821 char localeName8[100]; 822 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 823 locale = Locale::createFromName(localeName8); 824 charIdx += localeMatcher.group(0, status).length() - 1; 825 TEST_ASSERT_SUCCESS(status); 826 break; 827 } 828 if (testString.compare(charIdx-1, 6, u"<data>") == 0) { 829 parseState = PARSE_DATA; 830 charIdx += 5; 831 tp.dataToBreak = ""; 832 tp.expectedBreaks->removeAllElements(); 833 tp.srcCol ->removeAllElements(); 834 tp.srcLine->removeAllElements(); 835 break; 836 } 837 838 errln("line %d: Tag expected in test file.", lineNum); 839 parseState = PARSE_COMMENT; 840 savedState = PARSE_DATA; 841 goto end_test; // Stop the test. 842 } 843 break; 844 845 case PARSE_RULES: 846 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) { 847 charIdx += 7; 848 parseState = PARSE_TAG; 849 delete tp.bi; 850 UParseError pe; 851 tp.bi = new RuleBasedBreakIterator(rules, pe, status); 852 skipTest = U_FAILURE(status); 853 if (U_FAILURE(status)) { 854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.", 855 rulesFirstLine + pe.line - 1, u_errorName(status)); 856 } 857 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) { 858 charIdx += 10; 859 parseState = PARSE_TAG; 860 UErrorCode ec = U_ZERO_ERROR; 861 UParseError pe; 862 RuleBasedBreakIterator bi(rules, pe, ec); 863 if (U_SUCCESS(ec)) { 864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.", 865 rulesFirstLine + pe.line - 1); 866 } 867 } else { 868 rules.append(c); 869 } 870 break; 871 872 case PARSE_DATA: 873 if (c == u'') { 874 int32_t breakIdx = tp.dataToBreak.length(); 875 tp.expectedBreaks->setSize(breakIdx+1); 876 
tp.expectedBreaks->setElementAt(-1, breakIdx); 877 tp.srcLine->setSize(breakIdx+1); 878 tp.srcLine->setElementAt(lineNum, breakIdx); 879 tp.srcCol ->setSize(breakIdx+1); 880 tp.srcCol ->setElementAt(column, breakIdx); 881 break; 882 } 883 884 if (testString.compare(charIdx-1, 7, u"</data>") == 0) { 885 // Add final entry to mappings from break location to source file position. 886 // Need one extra because last break position returned is after the 887 // last char in the data, not at the last char. 888 tp.srcLine->addElement(lineNum, status); 889 tp.srcCol ->addElement(column, status); 890 891 parseState = PARSE_TAG; 892 charIdx += 6; 893 894 if (!skipTest) { 895 // RUN THE TEST! 896 status = U_ZERO_ERROR; 897 tp.setUTF16(status); 898 executeTest(&tp, status); 899 TEST_ASSERT_SUCCESS(status); 900 901 // Run again, this time with UTF-8 text wrapped in a UText. 902 status = U_ZERO_ERROR; 903 tp.setUTF8(status); 904 TEST_ASSERT_SUCCESS(status); 905 executeTest(&tp, status); 906 } 907 break; 908 } 909 910 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) { 911 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 912 // Get the code point from the name and insert it into the test data. 913 // (Damn, no API takes names in Unicode !!! 914 // we've got to take it back to char *) 915 int32_t nameEndIdx = testString.indexOf(u'}', charIdx); 916 int32_t nameLength = nameEndIdx - (charIdx+2); 917 char charNameBuf[200]; 918 UChar32 theChar = -1; 919 if (nameEndIdx != -1) { 920 UErrorCode status = U_ZERO_ERROR; 921 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 922 charNameBuf[sizeof(charNameBuf)-1] = 0; 923 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 924 if (U_FAILURE(status)) { 925 theChar = -1; 926 } 927 } 928 if (theChar == -1) { 929 errln("Error in named character in test file at line %d, col %d", 930 lineNum, column); 931 } else { 932 // Named code point was recognized. Insert it 933 // into the test data. 
934 tp.dataToBreak.append(theChar); 935 while (tp.dataToBreak.length() > tp.srcLine->size()) { 936 tp.srcLine->addElement(lineNum, status); 937 tp.srcCol ->addElement(column, status); 938 } 939 } 940 if (nameEndIdx > charIdx) { 941 charIdx = nameEndIdx+1; 942 943 } 944 break; 945 } 946 947 948 949 if (testString.compare(charIdx-1, 2, u"<>") == 0) { 950 charIdx++; 951 int32_t breakIdx = tp.dataToBreak.length(); 952 tp.expectedBreaks->setSize(breakIdx+1); 953 tp.expectedBreaks->setElementAt(-1, breakIdx); 954 tp.srcLine->setSize(breakIdx+1); 955 tp.srcLine->setElementAt(lineNum, breakIdx); 956 tp.srcCol ->setSize(breakIdx+1); 957 tp.srcCol ->setElementAt(column, breakIdx); 958 break; 959 } 960 961 if (c == u'<') { 962 tagValue = 0; 963 parseState = PARSE_NUM; 964 break; 965 } 966 967 if (c == u'#' && column==3) { // TODO: why is column off so far? 968 parseState = PARSE_COMMENT; 969 savedState = PARSE_DATA; 970 break; 971 } 972 973 if (c == u'\\') { 974 // Check for \ at end of line, a line continuation. 975 // Advance over (discard) the newline 976 UChar32 cp = testString.char32At(charIdx); 977 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') { 978 // We have a CR LF 979 // Need an extra increment of the input ptr to move over both of them 980 charIdx++; 981 } 982 if (cp == u'\n' || cp == u'\r') { 983 lineNum++; 984 colStart = charIdx; 985 charIdx++; 986 break; 987 } 988 989 // Let unescape handle the back slash. 990 cp = testString.unescapeAt(charIdx); 991 if (cp != -1) { 992 // Escape sequence was recognized. Insert the char 993 // into the test data. 994 tp.dataToBreak.append(cp); 995 while (tp.dataToBreak.length() > tp.srcLine->size()) { 996 tp.srcLine->addElement(lineNum, status); 997 tp.srcCol ->addElement(column, status); 998 } 999 break; 1000 } 1001 1002 1003 // Not a recognized backslash escape sequence. 1004 // Take the next char as a literal. 1005 // TODO: Should this be an error? 
1006 c = testString.charAt(charIdx); 1007 charIdx = testString.moveIndex32(charIdx, 1); 1008 } 1009 1010 // Normal, non-escaped data char. 1011 tp.dataToBreak.append(c); 1012 1013 // Save the mapping from offset in the data to line/column numbers in 1014 // the original input file. Will be used for better error messages only. 1015 // If there's an expected break before this char, the slot in the mapping 1016 // vector will already be set for this char; don't overwrite it. 1017 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1018 tp.srcLine->addElement(lineNum, status); 1019 tp.srcCol ->addElement(column, status); 1020 } 1021 break; 1022 1023 1024 case PARSE_NUM: 1025 // We are parsing an expected numeric tag value, like <1234>, 1026 // within a chunk of data. 1027 if (u_isUWhiteSpace(c)) { 1028 break; 1029 } 1030 1031 if (c == u'>') { 1032 // Finished the number. Add the info to the expected break data, 1033 // and switch parse state back to doing plain data. 1034 parseState = PARSE_DATA; 1035 if (tagValue == 0) { 1036 tagValue = -1; 1037 } 1038 int32_t breakIdx = tp.dataToBreak.length(); 1039 tp.expectedBreaks->setSize(breakIdx+1); 1040 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1041 tp.srcLine->setSize(breakIdx+1); 1042 tp.srcLine->setElementAt(lineNum, breakIdx); 1043 tp.srcCol ->setSize(breakIdx+1); 1044 tp.srcCol ->setElementAt(column, breakIdx); 1045 break; 1046 } 1047 1048 if (u_isdigit(c)) { 1049 tagValue = tagValue*10 + u_charDigitValue(c); 1050 break; 1051 } 1052 1053 errln("Syntax Error in test file at line %d, col %d", 1054 lineNum, column); 1055 parseState = PARSE_COMMENT; 1056 goto end_test; // Stop the test 1057 break; 1058 } 1059 1060 1061 if (U_FAILURE(status)) { 1062 dataerrln("ICU Error %s while parsing test file at line %d.", 1063 u_errorName(status), lineNum); 1064 status = U_ZERO_ERROR; 1065 goto end_test; // Stop the test 1066 } 1067 1068 } 1069 1070 // Reached end of test file. 
Raise an error if parseState indicates that we are 1071 // within a block that should have been terminated. 1072 1073 if (parseState == PARSE_RULES) { 1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.", 1075 lineNum, rulesFirstLine); 1076 } 1077 if (parseState == PARSE_DATA) { 1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum); 1079 } 1080 1081 1082 end_test: 1083 delete [] testFile; 1084 #endif 1085 } 1086 1087 1088 //------------------------------------------------------------------------------- 1089 // 1090 // TestDictRules create a break iterator from source rules that includes a 1091 // dictionary range. Regression for bug #7130. Source rules 1092 // do not declare a break iterator type (word, line, sentence, etc. 1093 // but the dictionary code, without a type, would loop. 1094 // 1095 //------------------------------------------------------------------------------- 1096 void RBBITest::TestDictRules() { 1097 const char *rules = "$dictionary = [a-z]; \n" 1098 "!!forward; \n" 1099 "$dictionary $dictionary; \n" 1100 "!!reverse; \n" 1101 "$dictionary $dictionary; \n"; 1102 const char *text = "aa"; 1103 UErrorCode status = U_ZERO_ERROR; 1104 UParseError parseError; 1105 1106 RuleBasedBreakIterator bi(rules, parseError, status); 1107 if (U_SUCCESS(status)) { 1108 UnicodeString utext = text; 1109 bi.setText(utext); 1110 int32_t position; 1111 int32_t loops; 1112 for (loops = 0; loops<10; loops++) { 1113 position = bi.next(); 1114 if (position == RuleBasedBreakIterator::DONE) { 1115 break; 1116 } 1117 } 1118 TEST_ASSERT(loops == 1); 1119 } else { 1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1121 } 1122 } 1123 1124 1125 1126 //------------------------------------------------------------------------------- 1127 // 1128 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1129 // return the data in one big UChar * buffer, which the caller must delete. 
1130 // 1131 // parameters: 1132 // fileName: the name of the file, with no directory part. The test data directory 1133 // is assumed. 1134 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1135 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1136 // specified here. The BOM, if it exists, will be stripped from the returned data. 1137 // Pass NULL for the system default encoding. 1138 // status 1139 // returns: 1140 // The file data, converted to UChar. 1141 // The caller must delete this when done with 1142 // delete [] theBuffer; 1143 // 1144 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1145 // Move this function to some common place. 1146 // 1147 //-------------------------------------------------------------------------------- 1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1149 UChar *retPtr = NULL; 1150 char *fileBuf = NULL; 1151 UConverter* conv = NULL; 1152 FILE *f = NULL; 1153 1154 ulen = 0; 1155 if (U_FAILURE(status)) { 1156 return retPtr; 1157 } 1158 1159 // 1160 // Open the file. 
1161 // 1162 f = fopen(fileName, "rb"); 1163 if (f == 0) { 1164 dataerrln("Error opening test data file %s\n", fileName); 1165 status = U_FILE_ACCESS_ERROR; 1166 return NULL; 1167 } 1168 // 1169 // Read it in 1170 // 1171 int fileSize; 1172 int amt_read; 1173 1174 fseek( f, 0, SEEK_END); 1175 fileSize = ftell(f); 1176 fileBuf = new char[fileSize]; 1177 fseek(f, 0, SEEK_SET); 1178 amt_read = fread(fileBuf, 1, fileSize, f); 1179 if (amt_read != fileSize || fileSize <= 0) { 1180 errln("Error reading test data file."); 1181 goto cleanUpAndReturn; 1182 } 1183 1184 // 1185 // Look for a Unicode Signature (BOM) on the data just read 1186 // 1187 int32_t signatureLength; 1188 const char * fileBufC; 1189 const char* bomEncoding; 1190 1191 fileBufC = fileBuf; 1192 bomEncoding = ucnv_detectUnicodeSignature( 1193 fileBuf, fileSize, &signatureLength, &status); 1194 if(bomEncoding!=NULL ){ 1195 fileBufC += signatureLength; 1196 fileSize -= signatureLength; 1197 encoding = bomEncoding; 1198 } 1199 1200 // 1201 // Open a converter to take the rule file to UTF-16 1202 // 1203 conv = ucnv_open(encoding, &status); 1204 if (U_FAILURE(status)) { 1205 goto cleanUpAndReturn; 1206 } 1207 1208 // 1209 // Convert the rules to UChar. 1210 // Preflight first to determine required buffer size. 1211 // 1212 ulen = ucnv_toUChars(conv, 1213 NULL, // dest, 1214 0, // destCapacity, 1215 fileBufC, 1216 fileSize, 1217 &status); 1218 if (status == U_BUFFER_OVERFLOW_ERROR) { 1219 // Buffer Overflow is expected from the preflight operation. 
1220 status = U_ZERO_ERROR; 1221 1222 retPtr = new UChar[ulen+1]; 1223 ucnv_toUChars(conv, 1224 retPtr, // dest, 1225 ulen+1, 1226 fileBufC, 1227 fileSize, 1228 &status); 1229 } 1230 1231 cleanUpAndReturn: 1232 fclose(f); 1233 delete []fileBuf; 1234 ucnv_close(conv); 1235 if (U_FAILURE(status)) { 1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1237 delete []retPtr; 1238 retPtr = 0; 1239 ulen = 0; 1240 }; 1241 return retPtr; 1242 } 1243 1244 1245 1246 //-------------------------------------------------------------------------------------------- 1247 // 1248 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1249 // 1250 //------------------------------------------------------------------------------------------- 1251 void RBBITest::TestUnicodeFiles() { 1252 RuleBasedBreakIterator *bi; 1253 UErrorCode status = U_ZERO_ERROR; 1254 1255 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1256 TEST_ASSERT_SUCCESS(status); 1257 if (U_SUCCESS(status)) { 1258 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1259 } 1260 delete bi; 1261 1262 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1263 TEST_ASSERT_SUCCESS(status); 1264 if (U_SUCCESS(status)) { 1265 runUnicodeTestData("WordBreakTest.txt", bi); 1266 } 1267 delete bi; 1268 1269 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1270 TEST_ASSERT_SUCCESS(status); 1271 if (U_SUCCESS(status)) { 1272 runUnicodeTestData("SentenceBreakTest.txt", bi); 1273 } 1274 delete bi; 1275 1276 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1277 TEST_ASSERT_SUCCESS(status); 1278 if (U_SUCCESS(status)) { 1279 runUnicodeTestData("LineBreakTest.txt", bi); 1280 } 1281 delete bi; 1282 } 1283 1284 1285 // Check for test cases from the Unicode test data files that are known to fail 1286 // 
and should be skipped as known issues because ICU does not fully implement 1287 // the Unicode specifications, or because ICU includes tailorings that differ from 1288 // the Unicode standard. 1289 // 1290 // Test cases are identified by the test data sequence, which tends to be more stable 1291 // across Unicode versions than the test file line numbers. 1292 // 1293 // The test case with ticket "10666" is a dummy, included as an example. 1294 1295 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 1296 static struct TestCase { 1297 const char *fTicketNum; 1298 const char *fFileName; 1299 const UChar *fString; 1300 } badTestCases[] = { 1301 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration. 1302 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root. 1303 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time 1304 // ICU is out of sync with Unicode. 
1305 {"8151", "LineBreakTest.txt", u"-#"}, 1306 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"}, 1307 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"}, 1308 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"}, 1309 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"}, 1310 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"}, 1311 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"}, 1312 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"}, 1313 }; 1314 1315 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) { 1316 const TestCase &badCase = badTestCases[n]; 1317 if (!strcmp(fileName, badCase.fFileName) && 1318 testCase == UnicodeString(badCase.fString)) { 1319 return logKnownIssue(badCase.fTicketNum); 1320 } 1321 } 1322 return FALSE; 1323 } 1324 1325 1326 //-------------------------------------------------------------------------------------------- 1327 // 1328 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1329 // 1330 //------------------------------------------------------------------------------------------- 1331 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1332 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1333 UErrorCode status = U_ZERO_ERROR; 1334 1335 // 1336 // Open and read the test data file, put it into a UnicodeString. 1337 // 1338 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1339 char testFileName[1000]; 1340 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1341 dataerrln("Can't open test data. 
Path too long."); 1342 return; 1343 } 1344 strcpy(testFileName, testDataDirectory); 1345 strcat(testFileName, fileName); 1346 1347 logln("Opening data file %s\n", fileName); 1348 1349 int len; 1350 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1351 if (status != U_FILE_ACCESS_ERROR) { 1352 TEST_ASSERT_SUCCESS(status); 1353 TEST_ASSERT(testFile != NULL); 1354 } 1355 if (U_FAILURE(status) || testFile == NULL) { 1356 return; /* something went wrong, error already output */ 1357 } 1358 UnicodeString testFileAsString(TRUE, testFile, len); 1359 1360 // 1361 // Parse the test data file using a regular expression. 1362 // Each kind of token is recognized in its own capture group; what type of item was scanned 1363 // is identified by which group had a match. 1364 // 1365 // Caputure Group # 1 2 3 4 5 1366 // Parses this item: divide x hex digits comment \n unrecognized \n 1367 // 1368 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1369 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1370 UnicodeString testString; 1371 UVector32 breakPositions(status); 1372 int lineNumber = 1; 1373 TEST_ASSERT_SUCCESS(status); 1374 if (U_FAILURE(status)) { 1375 return; 1376 } 1377 1378 // 1379 // Scan through each test case, building up the string to be broken in testString, 1380 // and the positions that should be boundaries in the breakPositions vector. 1381 // 1382 int spin = 0; 1383 while (tokenMatcher.find()) { 1384 if(tokenMatcher.hitEnd()) { 1385 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1386 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1387 and caused an infinite loop here on EBCDIC systems! 
1388 */ 1389 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1390 // return; 1391 } 1392 if (tokenMatcher.start(1, status) >= 0) { 1393 // Scanned a divide sign, indicating a break position in the test data. 1394 if (testString.length()>0) { 1395 breakPositions.addElement(testString.length(), status); 1396 } 1397 } 1398 else if (tokenMatcher.start(2, status) >= 0) { 1399 // Scanned an 'x', meaning no break at this position in the test data 1400 // Nothing to be done here. 1401 } 1402 else if (tokenMatcher.start(3, status) >= 0) { 1403 // Scanned Hex digits. Convert them to binary, append to the character data string. 1404 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1405 int length = hexNumber.length(); 1406 if (length<=8) { 1407 char buf[10]; 1408 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1409 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1410 if (c<=0x10ffff) { 1411 testString.append(c); 1412 } else { 1413 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1414 fileName, lineNumber); 1415 } 1416 } else { 1417 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1418 fileName, lineNumber); 1419 } 1420 } 1421 else if (tokenMatcher.start(4, status) >= 0) { 1422 // Scanned to end of a line, possibly skipping over a comment in the process. 1423 // If the line from the file contained test data, run the test now. 1424 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1425 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1426 } 1427 1428 // Clear out this test case. 1429 // The string and breakPositions vector will be refilled as the next 1430 // test case is parsed. 1431 testString.remove(); 1432 breakPositions.removeAllElements(); 1433 lineNumber++; 1434 } else { 1435 // Scanner catchall. Something unrecognized appeared on the line. 
1436 char token[16]; 1437 UnicodeString uToken = tokenMatcher.group(0, status); 1438 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1439 token[sizeof(token)-1] = 0; 1440 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1441 1442 // Clean up, in preparation for continuing with the next line. 1443 testString.remove(); 1444 breakPositions.removeAllElements(); 1445 lineNumber++; 1446 } 1447 TEST_ASSERT_SUCCESS(status); 1448 if (U_FAILURE(status)) { 1449 break; 1450 } 1451 } 1452 1453 delete [] testFile; 1454 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1455 } 1456 1457 //-------------------------------------------------------------------------------------------- 1458 // 1459 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1460 // test data files. Do only a simple, forward-only check - 1461 // this test is mostly to check that ICU and the Unicode 1462 // data agree with each other. 1463 // 1464 //-------------------------------------------------------------------------------------------- 1465 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1466 const UnicodeString &testString, // Text data to be broken 1467 UVector32 *breakPositions, // Positions where breaks should be found. 1468 RuleBasedBreakIterator *bi) { 1469 int32_t pos; // Break Position in the test string 1470 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
1471 int32_t expectedPos; // Expected break position (index into test string) 1472 1473 bi->setText(testString); 1474 pos = bi->first(); 1475 pos = bi->next(); 1476 1477 while (pos != BreakIterator::DONE) { 1478 if (expectedI >= breakPositions->size()) { 1479 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1480 testFileName, lineNumber, pos); 1481 break; 1482 } 1483 expectedPos = breakPositions->elementAti(expectedI); 1484 if (pos < expectedPos) { 1485 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1486 testFileName, lineNumber, pos); 1487 break; 1488 } 1489 if (pos > expectedPos) { 1490 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1491 testFileName, lineNumber, expectedPos); 1492 break; 1493 } 1494 pos = bi->next(); 1495 expectedI++; 1496 } 1497 1498 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1499 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1500 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1501 } 1502 } 1503 1504 1505 1506 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1507 //--------------------------------------------------------------------------------------- 1508 // 1509 // classs RBBIMonkeyKind 1510 // 1511 // Monkey Test for Break Iteration 1512 // Abstract interface class. Concrete derived classes independently 1513 // implement the break rules for different iterator types. 1514 // 1515 // The Monkey Test itself uses doesn't know which type of break iterator it is 1516 // testing, but works purely in terms of the interface defined here. 1517 // 1518 //--------------------------------------------------------------------------------------- 1519 class RBBIMonkeyKind { 1520 public: 1521 // Return a UVector of UnicodeSets, representing the character classes used 1522 // for this type of iterator. 
1523 virtual UVector *charClasses() = 0; 1524 1525 // Set the test text on which subsequent calls to next() will operate 1526 virtual void setText(const UnicodeString &s) = 0; 1527 1528 // Find the next break postion, starting from the prev break position, or from zero. 1529 // Return -1 after reaching end of string. 1530 virtual int32_t next(int32_t i) = 0; 1531 1532 virtual ~RBBIMonkeyKind(); 1533 UErrorCode deferredStatus; 1534 1535 1536 protected: 1537 RBBIMonkeyKind(); 1538 1539 private: 1540 }; 1541 1542 RBBIMonkeyKind::RBBIMonkeyKind() { 1543 deferredStatus = U_ZERO_ERROR; 1544 } 1545 1546 RBBIMonkeyKind::~RBBIMonkeyKind() { 1547 } 1548 1549 1550 //---------------------------------------------------------------------------------------- 1551 // 1552 // Random Numbers. Similar to standard lib rand() and srand() 1553 // Not using library to 1554 // 1. Get same results on all platforms. 1555 // 2. Get access to current seed, to more easily reproduce failures. 1556 // 1557 //--------------------------------------------------------------------------------------- 1558 static uint32_t m_seed = 1; 1559 1560 static uint32_t m_rand() 1561 { 1562 m_seed = m_seed * 1103515245 + 12345; 1563 return (uint32_t)(m_seed/65536) % 32768; 1564 } 1565 1566 1567 //------------------------------------------------------------------------------------------ 1568 // 1569 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1570 // of RBBIMonkeyKind. 
//
//------------------------------------------------------------------------------------------
// Reference implementation of grapheme cluster boundaries for the monkey test.
// The rule comments in next() name the GB rules being applied; the sets come
// from the Grapheme_Cluster_Break and Extended_Pictographic properties.
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    UVector   *fSets;               // The sets handed out by charClasses().

    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;        // Union of the L, V, T, LV and LVT sets.
    UnicodeSet  *fExtendedPictSet;
    UnicodeSet  *fAnySet;           // All code points, 0 .. 0x10ffff.

    const UnicodeString *fText;     // Text set by setText(); not owned, only the address is kept.
};


// Build every character class set and register the ones exposed through
// charClasses().  A failure is not reported here; it is stored in
// deferredStatus, which next() checks before doing any work.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;

    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
    fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
    fRegionalIndicatorSet =
                  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
    fHangulSet  = new UnicodeSet();
    fHangulSet->addAll(*fLSet);
    fHangulSet->addAll(*fVSet);
    fHangulSet->addAll(*fTSet);
    fHangulSet->addAll(*fLVSet);
    fHangulSet->addAll(*fLVTSet);

    fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
    fAnySet          = new UnicodeSet(0, 0x10ffff);

    // The individual L/V/T/LV/LVT sets are not registered below; Hangul is
    // exposed only through the combined fHangulSet.
    fSets             = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fRegionalIndicatorSet, status);
    // The Prepend class is registered only when it has members.
    if (!fPrependSet->isEmpty()) {
        fSets->addElement(fPrependSet, status);
    }
    fSets->addElement(fSpacingSet, status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    fSets->addElement(fZWJSet,     status);
    fSets->addElement(fExtendedPictSet, status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text that next() will operate on.  Only the address is stored.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}



// Return the position of the first grapheme cluster boundary after prevPos,
// or -1 when prevPos is already at or past the end of the text, or when the
// constructor recorded a failure in deferredStatus.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    // Within the loop, "continue" means no break before p2 (keep scanning) and
    // "break" means a boundary was found before p2.  The rules are tried in
    // order, so the first one that applies decides.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x (Extend | ZWJ)
        //     cBase records the character in front of a run of Extend
        //     characters; GB11 below tests it.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
            continue;
        }

        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB999)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}



// Return the vector of character class sets built by the constructor.
// The vector remains owned by this object.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


// Release the vector and every set.  Each set is deleted individually here,
// including those that were never added to fSets.
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fRegionalIndicatorSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
    delete fZWJSet;
    delete fExtendedPictSet;
}

//------------------------------------------------------------------------------------------
//
//   class RBBIWordMonkey      Word Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
// Reference implementation of word boundaries for the monkey test.  The sets
// come from the Word_Break property, plus Extended_Pictographic and a
// dictionary set that is kept out of the generated test data.
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);
private:
    UVector      *fSets;            // The sets registered by the constructor.

    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fHebrew_LetterSet;
    UnicodeSet  *fALetterSet;       // ALetter with the dictionary characters removed.
    UnicodeSet  *fSingle_QuoteSet;
    UnicodeSet  *fDouble_QuoteSet;
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;         // Everything left after the sets removed in the constructor.
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fWSegSpaceSet;
    UnicodeSet  *fDictionarySet;    // Hangul syllables, Han, Hiragana, Katakana and Complex_Context.
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fExtendedPictSet;

    const UnicodeString  *fText;    // Text set by setText(); not owned, only the address is kept.
};


// Build every character class set and register the ones used for test data
// in fSets.  A failure while creating the property sets is logged, stored in
// deferredStatus, and ends construction early with fSets left empty.
RBBIWordMonkey::RBBIWordMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
    fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
    fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
    fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
    fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
    fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
    fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
    fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
    fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
    fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
    fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
    fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
    fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
    fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
    fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
    fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
    fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);

    fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
    fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);

    fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
    fDictionarySet->addAll(*fKatakanaSet);
    fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));

    // Dictionary characters are taken out of ALetter so they do not reach the test data.
    fALetterSet->removeAll(*fDictionarySet);

    fOtherSet        = new UnicodeSet();
    if(U_FAILURE(status)) {
        IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
        deferredStatus = status;
        return;
    }

    // "Other" starts as all code points, then loses every class listed below.
    // NOTE(review): fMidNumLetSet is not removed here although it is added to
    //               fSets further down - confirm whether that is intentional.
    fOtherSet->complement();
    fOtherSet->removeAll(*fCRSet);
    fOtherSet->removeAll(*fLFSet);
    fOtherSet->removeAll(*fNewlineSet);
    fOtherSet->removeAll(*fKatakanaSet);
    fOtherSet->removeAll(*fHebrew_LetterSet);
    fOtherSet->removeAll(*fALetterSet);
    fOtherSet->removeAll(*fSingle_QuoteSet);
    fOtherSet->removeAll(*fDouble_QuoteSet);
    fOtherSet->removeAll(*fMidLetterSet);
    fOtherSet->removeAll(*fMidNumSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fExtendNumLetSet);
    fOtherSet->removeAll(*fWSegSpaceSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fExtendSet);
    fOtherSet->removeAll(*fRegionalIndicatorSet);
    fOtherSet->removeAll(*fZWJSet);
    fOtherSet->removeAll(*fExtendedPictSet);

    // Inhibit dictionary characters from being tested at all.
    fOtherSet->removeAll(*fDictionarySet);

    fSets->addElement(fCRSet,                status);
    fSets->addElement(fLFSet,                status);
    fSets->addElement(fNewlineSet,           status);
    fSets->addElement(fRegionalIndicatorSet, status);
    fSets->addElement(fHebrew_LetterSet,     status);
    fSets->addElement(fALetterSet,           status);
    fSets->addElement(fSingle_QuoteSet,      status);
    fSets->addElement(fDouble_QuoteSet,      status);
    //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
                                                        // from the test data. They are all in the dictionary set,
                                                        // which this (old, to be retired) monkey test cannot handle.
    fSets->addElement(fMidLetterSet,         status);
    fSets->addElement(fMidNumLetSet,         status);
    fSets->addElement(fMidNumSet,            status);
    fSets->addElement(fNumericSet,           status);
    fSets->addElement(fFormatSet,            status);
    fSets->addElement(fExtendSet,            status);
    fSets->addElement(fOtherSet,             status);
    fSets->addElement(fExtendNumLetSet,      status);
    fSets->addElement(fWSegSpaceSet,         status);

    fSets->addElement(fZWJSet,               status);
    fSets->addElement(fExtendedPictSet,      status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}

// Remember the text that next() will operate on.  Only the address is stored.
void RBBIWordMonkey::setText(const UnicodeString &s) {
    fText       = &s;
}


int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
1968 if (prevPos >= fText->length()) { 1969 return -1; 1970 } 1971 p0 = p1 = p2 = p3 = prevPos; 1972 c3 = fText->char32At(prevPos); 1973 c0 = c1 = c2 = 0; 1974 (void)p0; // Suppress set but not used warning. 1975 1976 // Loop runs once per "significant" character position in the input text. 1977 for (;;) { 1978 // Move all of the positions forward in the input string. 1979 p0 = p1; c0 = c1; 1980 p1 = p2; c1 = c2; 1981 p2 = p3; c2 = c3; 1982 1983 // Advancd p3 by X(Extend | Format)* Rule 4 1984 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 1985 do { 1986 p3 = fText->moveIndex32(p3, 1); 1987 c3 = fText->char32At(p3); 1988 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 1989 break; 1990 }; 1991 } 1992 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3)); 1993 1994 1995 if (p1 == p2) { 1996 // Still warming up the loop. (won't work with zero length strings, but we don't care) 1997 continue; 1998 } 1999 if (p2 == fText->length()) { 2000 // Reached end of string. Always a break position. 2001 break; 2002 } 2003 2004 // Rule (3) CR x LF 2005 // No Extend or Format characters may appear between the CR and LF, 2006 // which requires the additional check for p2 immediately following p1. 2007 // 2008 if (c1==0x0D && c2==0x0A) { 2009 continue; 2010 } 2011 2012 // Rule (3a) Break before and after newlines (including CR and LF) 2013 // 2014 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2015 break; 2016 }; 2017 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2018 break; 2019 }; 2020 2021 // Rule (3c) ZWJ x Extended_Pictographic 2022 // Not ignoring extend chars, so peek into input text to 2023 // get the potential ZWJ, the character immediately preceding c2. 2024 // Sloppy UChar32 indexing: p2-1 may reference trail half 2025 // but char32At will get the full code point. 
2026 if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) { 2027 continue; 2028 } 2029 2030 // Rule (3d) Keep horizontal whitespace together. 2031 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) { 2032 continue; 2033 } 2034 2035 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2036 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2037 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2038 continue; 2039 } 2040 2041 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2042 // 2043 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2044 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2045 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2046 continue; 2047 } 2048 2049 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2050 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2051 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2052 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2053 continue; 2054 } 2055 2056 // Rule (7a) Hebrew_Letter x Single_Quote 2057 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2058 continue; 2059 } 2060 2061 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2062 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2063 continue; 2064 } 2065 2066 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2067 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2068 continue; 2069 } 2070 2071 // Rule (8) Numeric x Numeric 2072 if (fNumericSet->contains(c1) && 2073 fNumericSet->contains(c2)) { 2074 continue; 2075 } 2076 
2077 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2078 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2079 fNumericSet->contains(c2)) { 2080 continue; 2081 } 2082 2083 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2084 if (fNumericSet->contains(c1) && 2085 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2086 continue; 2087 } 2088 2089 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2090 if (fNumericSet->contains(c0) && 2091 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2092 fNumericSet->contains(c2)) { 2093 continue; 2094 } 2095 2096 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2097 if (fNumericSet->contains(c1) && 2098 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2099 fNumericSet->contains(c3)) { 2100 continue; 2101 } 2102 2103 // Rule (13) Katakana x Katakana 2104 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 2105 // all Katakana are handled by the dictionary breaker. 2106 if (fKatakanaSet->contains(c1) && 2107 fKatakanaSet->contains(c2)) { 2108 continue; 2109 } 2110 2111 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2112 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2113 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2114 fExtendNumLetSet->contains(c2)) { 2115 continue; 2116 } 2117 2118 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2119 if (fExtendNumLetSet->contains(c1) && 2120 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2121 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2122 continue; 2123 } 2124 2125 // Rule 15 - 17 Group pairs of Regional Indicators. 
2126 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { 2127 break; 2128 } 2129 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2130 continue; 2131 } 2132 2133 // Rule 999. Break found here. 2134 break; 2135 } 2136 2137 breakPos = p2; 2138 return breakPos; 2139 } 2140 2141 2142 UVector *RBBIWordMonkey::charClasses() { 2143 return fSets; 2144 } 2145 2146 2147 RBBIWordMonkey::~RBBIWordMonkey() { 2148 delete fSets; 2149 delete fCRSet; 2150 delete fLFSet; 2151 delete fNewlineSet; 2152 delete fKatakanaSet; 2153 delete fHebrew_LetterSet; 2154 delete fALetterSet; 2155 delete fSingle_QuoteSet; 2156 delete fDouble_QuoteSet; 2157 delete fMidNumLetSet; 2158 delete fMidLetterSet; 2159 delete fMidNumSet; 2160 delete fNumericSet; 2161 delete fFormatSet; 2162 delete fExtendSet; 2163 delete fExtendNumLetSet; 2164 delete fWSegSpaceSet; 2165 delete fRegionalIndicatorSet; 2166 delete fDictionarySet; 2167 delete fOtherSet; 2168 delete fZWJSet; 2169 delete fExtendedPictSet; 2170 } 2171 2172 2173 2174 2175 //------------------------------------------------------------------------------------------ 2176 // 2177 // class RBBISentMonkey Sentence Break specific implementation 2178 // of RBBIMonkeyKind. 
2179 // 2180 //------------------------------------------------------------------------------------------ 2181 class RBBISentMonkey: public RBBIMonkeyKind { 2182 public: 2183 RBBISentMonkey(); 2184 virtual ~RBBISentMonkey(); 2185 virtual UVector *charClasses(); 2186 virtual void setText(const UnicodeString &s); 2187 virtual int32_t next(int32_t i); 2188 private: 2189 int moveBack(int posFrom); 2190 int moveForward(int posFrom); 2191 UChar32 cAt(int pos); 2192 2193 UVector *fSets; 2194 2195 UnicodeSet *fSepSet; 2196 UnicodeSet *fFormatSet; 2197 UnicodeSet *fSpSet; 2198 UnicodeSet *fLowerSet; 2199 UnicodeSet *fUpperSet; 2200 UnicodeSet *fOLetterSet; 2201 UnicodeSet *fNumericSet; 2202 UnicodeSet *fATermSet; 2203 UnicodeSet *fSContinueSet; 2204 UnicodeSet *fSTermSet; 2205 UnicodeSet *fCloseSet; 2206 UnicodeSet *fOtherSet; 2207 UnicodeSet *fExtendSet; 2208 2209 const UnicodeString *fText; 2210 2211 }; 2212 2213 RBBISentMonkey::RBBISentMonkey() 2214 { 2215 UErrorCode status = U_ZERO_ERROR; 2216 2217 fSets = new UVector(status); 2218 2219 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2220 // set and made into character classes of their own. For the monkey impl, 2221 // they remain in SEP, since Sep always appears with CR and LF in the rules. 
2222 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2223 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2224 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2225 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2226 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2227 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2228 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2229 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2230 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2231 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2232 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2233 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2234 fOtherSet = new UnicodeSet(); 2235 2236 if(U_FAILURE(status)) { 2237 deferredStatus = status; 2238 return; 2239 } 2240 2241 fOtherSet->complement(); 2242 fOtherSet->removeAll(*fSepSet); 2243 fOtherSet->removeAll(*fFormatSet); 2244 fOtherSet->removeAll(*fSpSet); 2245 fOtherSet->removeAll(*fLowerSet); 2246 fOtherSet->removeAll(*fUpperSet); 2247 fOtherSet->removeAll(*fOLetterSet); 2248 fOtherSet->removeAll(*fNumericSet); 2249 fOtherSet->removeAll(*fATermSet); 2250 fOtherSet->removeAll(*fSContinueSet); 2251 fOtherSet->removeAll(*fSTermSet); 2252 fOtherSet->removeAll(*fCloseSet); 2253 fOtherSet->removeAll(*fExtendSet); 2254 2255 fSets->addElement(fSepSet, status); 2256 fSets->addElement(fFormatSet, status); 2257 fSets->addElement(fSpSet, status); 2258 fSets->addElement(fLowerSet, status); 2259 
fSets->addElement(fUpperSet, status); 2260 fSets->addElement(fOLetterSet, status); 2261 fSets->addElement(fNumericSet, status); 2262 fSets->addElement(fATermSet, status); 2263 fSets->addElement(fSContinueSet, status); 2264 fSets->addElement(fSTermSet, status); 2265 fSets->addElement(fCloseSet, status); 2266 fSets->addElement(fOtherSet, status); 2267 fSets->addElement(fExtendSet, status); 2268 2269 if (U_FAILURE(status)) { 2270 deferredStatus = status; 2271 } 2272 } 2273 2274 2275 2276 void RBBISentMonkey::setText(const UnicodeString &s) { 2277 fText = &s; 2278 } 2279 2280 UVector *RBBISentMonkey::charClasses() { 2281 return fSets; 2282 } 2283 2284 2285 // moveBack() Find the "significant" code point preceding the index i. 2286 // Skips over ($Extend | $Format)* . 2287 // 2288 int RBBISentMonkey::moveBack(int i) { 2289 if (i <= 0) { 2290 return -1; 2291 } 2292 UChar32 c; 2293 int32_t j = i; 2294 do { 2295 j = fText->moveIndex32(j, -1); 2296 c = fText->char32At(j); 2297 } 2298 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2299 return j; 2300 2301 } 2302 2303 2304 int RBBISentMonkey::moveForward(int i) { 2305 if (i>=fText->length()) { 2306 return fText->length(); 2307 } 2308 UChar32 c; 2309 int32_t j = i; 2310 do { 2311 j = fText->moveIndex32(j, 1); 2312 c = cAt(j); 2313 } 2314 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2315 return j; 2316 } 2317 2318 UChar32 RBBISentMonkey::cAt(int pos) { 2319 if (pos<0 || pos>=fText->length()) { 2320 return -1; 2321 } else { 2322 return fText->char32At(pos); 2323 } 2324 } 2325 2326 int32_t RBBISentMonkey::next(int32_t prevPos) { 2327 int p0, p1, p2, p3; // Indices of the significant code points around the 2328 // break position being tested. The candidate break 2329 // location is before p2. 2330 2331 int breakPos = -1; 2332 2333 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 
2334 UChar32 c; 2335 2336 if (U_FAILURE(deferredStatus)) { 2337 return -1; 2338 } 2339 2340 // Prev break at end of string. return DONE. 2341 if (prevPos >= fText->length()) { 2342 return -1; 2343 } 2344 p0 = p1 = p2 = p3 = prevPos; 2345 c3 = fText->char32At(prevPos); 2346 c0 = c1 = c2 = 0; 2347 (void)p0; // Suppress set but not used warning. 2348 2349 // Loop runs once per "significant" character position in the input text. 2350 for (;;) { 2351 // Move all of the positions forward in the input string. 2352 p0 = p1; c0 = c1; 2353 p1 = p2; c1 = c2; 2354 p2 = p3; c2 = c3; 2355 2356 // Advancd p3 by X(Extend | Format)* Rule 4 2357 p3 = moveForward(p3); 2358 c3 = cAt(p3); 2359 2360 // Rule (3) CR x LF 2361 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2362 continue; 2363 } 2364 2365 // Rule (4). Sep <break> 2366 if (fSepSet->contains(c1)) { 2367 p2 = p1+1; // Separators don't combine with Extend or Format. 2368 break; 2369 } 2370 2371 if (p2 >= fText->length()) { 2372 // Reached end of string. Always a break position. 2373 break; 2374 } 2375 2376 if (p2 == prevPos) { 2377 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2378 continue; 2379 } 2380 2381 // Rule (6). ATerm x Numeric 2382 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2383 continue; 2384 } 2385 2386 // Rule (7). (Upper | Lower) ATerm x Uppper 2387 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && 2388 fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2389 continue; 2390 } 2391 2392 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2393 // Note: STerm | ATerm are added to the negated part of the expression by a 2394 // note to the Unicode 5.0 documents. 
2395 int p8 = p1; 2396 while (fSpSet->contains(cAt(p8))) { 2397 p8 = moveBack(p8); 2398 } 2399 while (fCloseSet->contains(cAt(p8))) { 2400 p8 = moveBack(p8); 2401 } 2402 if (fATermSet->contains(cAt(p8))) { 2403 p8=p2; 2404 for (;;) { 2405 c = cAt(p8); 2406 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2407 fLowerSet->contains(c) || fSepSet->contains(c) || 2408 fATermSet->contains(c) || fSTermSet->contains(c)) { 2409 break; 2410 } 2411 p8 = moveForward(p8); 2412 } 2413 if (fLowerSet->contains(cAt(p8))) { 2414 continue; 2415 } 2416 } 2417 2418 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2419 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2420 p8 = p1; 2421 while (fSpSet->contains(cAt(p8))) { 2422 p8 = moveBack(p8); 2423 } 2424 while (fCloseSet->contains(cAt(p8))) { 2425 p8 = moveBack(p8); 2426 } 2427 c = cAt(p8); 2428 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2429 continue; 2430 } 2431 } 2432 2433 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2434 int p9 = p1; 2435 while (fCloseSet->contains(cAt(p9))) { 2436 p9 = moveBack(p9); 2437 } 2438 c = cAt(p9); 2439 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2440 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2441 continue; 2442 } 2443 } 2444 2445 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2446 int p10 = p1; 2447 while (fSpSet->contains(cAt(p10))) { 2448 p10 = moveBack(p10); 2449 } 2450 while (fCloseSet->contains(cAt(p10))) { 2451 p10 = moveBack(p10); 2452 } 2453 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2454 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2455 continue; 2456 } 2457 } 2458 2459 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 2460 int p11 = p1; 2461 if (fSepSet->contains(cAt(p11))) { 2462 p11 = moveBack(p11); 2463 } 2464 while (fSpSet->contains(cAt(p11))) { 2465 p11 = moveBack(p11); 2466 } 2467 while (fCloseSet->contains(cAt(p11))) { 2468 p11 = moveBack(p11); 2469 } 2470 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2471 break; 2472 } 2473 2474 // Rule (12) Any x Any 2475 continue; 2476 } 2477 breakPos = p2; 2478 return breakPos; 2479 } 2480 2481 RBBISentMonkey::~RBBISentMonkey() { 2482 delete fSets; 2483 delete fSepSet; 2484 delete fFormatSet; 2485 delete fSpSet; 2486 delete fLowerSet; 2487 delete fUpperSet; 2488 delete fOLetterSet; 2489 delete fNumericSet; 2490 delete fATermSet; 2491 delete fSContinueSet; 2492 delete fSTermSet; 2493 delete fCloseSet; 2494 delete fOtherSet; 2495 delete fExtendSet; 2496 } 2497 2498 2499 2500 //------------------------------------------------------------------------------------------- 2501 // 2502 // RBBILineMonkey 2503 // 2504 //------------------------------------------------------------------------------------------- 2505 2506 class RBBILineMonkey: public RBBIMonkeyKind { 2507 public: 2508 RBBILineMonkey(); 2509 virtual ~RBBILineMonkey(); 2510 virtual UVector *charClasses(); 2511 virtual void setText(const UnicodeString &s); 2512 virtual int32_t next(int32_t i); 2513 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2514 private: 2515 UVector *fSets; 2516 2517 UnicodeSet *fBK; 2518 UnicodeSet *fCR; 2519 UnicodeSet *fLF; 2520 UnicodeSet *fCM; 2521 UnicodeSet *fNL; 2522 UnicodeSet *fSG; 2523 UnicodeSet *fWJ; 2524 UnicodeSet *fZW; 2525 UnicodeSet *fGL; 2526 UnicodeSet *fCB; 2527 UnicodeSet *fSP; 2528 UnicodeSet *fB2; 2529 UnicodeSet *fBA; 2530 UnicodeSet *fBB; 2531 UnicodeSet *fHH; 2532 UnicodeSet *fHY; 2533 UnicodeSet *fH2; 2534 UnicodeSet *fH3; 2535 UnicodeSet *fCL; 2536 UnicodeSet *fCP; 2537 UnicodeSet *fEX; 2538 UnicodeSet *fIN; 2539 UnicodeSet *fJL; 2540 UnicodeSet *fJV; 
2541 UnicodeSet *fJT; 2542 UnicodeSet *fNS; 2543 UnicodeSet *fOP; 2544 UnicodeSet *fQU; 2545 UnicodeSet *fIS; 2546 UnicodeSet *fNU; 2547 UnicodeSet *fPO; 2548 UnicodeSet *fPR; 2549 UnicodeSet *fSY; 2550 UnicodeSet *fAI; 2551 UnicodeSet *fAL; 2552 UnicodeSet *fCJ; 2553 UnicodeSet *fHL; 2554 UnicodeSet *fID; 2555 UnicodeSet *fRI; 2556 UnicodeSet *fXX; 2557 UnicodeSet *fEB; 2558 UnicodeSet *fEM; 2559 UnicodeSet *fZWJ; 2560 2561 BreakIterator *fCharBI; 2562 const UnicodeString *fText; 2563 RegexMatcher *fNumberMatcher; 2564 }; 2565 2566 RBBILineMonkey::RBBILineMonkey() : 2567 RBBIMonkeyKind(), 2568 fSets(NULL), 2569 2570 fCharBI(NULL), 2571 fText(NULL), 2572 fNumberMatcher(NULL) 2573 2574 { 2575 if (U_FAILURE(deferredStatus)) { 2576 return; 2577 } 2578 2579 UErrorCode status = U_ZERO_ERROR; 2580 2581 fSets = new UVector(status); 2582 2583 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2584 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2585 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2586 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2587 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2588 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2589 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2590 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2591 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2592 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2593 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2594 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2595 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2596 fHH = new UnicodeSet(); 2597 fHY = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2598 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2599 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2600 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2601 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2602 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2603 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2604 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2605 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2606 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2607 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2608 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2609 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2610 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2611 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2612 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2613 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2614 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2615 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2616 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2617 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2618 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2619 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2620 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2621 fSG = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2622 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2623 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status); 2624 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status); 2625 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); 2626 2627 if (U_FAILURE(status)) { 2628 deferredStatus = status; 2629 return; 2630 } 2631 2632 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2633 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2634 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2635 2636 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 2637 fCM->addAll(*fZWJ); // ZWJ behaves as a CM. 2638 2639 fHH->add(u'\u2010'); // Hyphen, '' 2640 2641 fSets->addElement(fBK, status); 2642 fSets->addElement(fCR, status); 2643 fSets->addElement(fLF, status); 2644 fSets->addElement(fCM, status); 2645 fSets->addElement(fNL, status); 2646 fSets->addElement(fWJ, status); 2647 fSets->addElement(fZW, status); 2648 fSets->addElement(fGL, status); 2649 fSets->addElement(fCB, status); 2650 fSets->addElement(fSP, status); 2651 fSets->addElement(fB2, status); 2652 fSets->addElement(fBA, status); 2653 fSets->addElement(fBB, status); 2654 fSets->addElement(fHY, status); 2655 fSets->addElement(fH2, status); 2656 fSets->addElement(fH3, status); 2657 fSets->addElement(fCL, status); 2658 fSets->addElement(fCP, status); 2659 fSets->addElement(fEX, status); 2660 fSets->addElement(fIN, status); 2661 fSets->addElement(fJL, status); 2662 fSets->addElement(fJT, status); 2663 fSets->addElement(fJV, status); 2664 fSets->addElement(fNS, status); 2665 fSets->addElement(fOP, status); 2666 fSets->addElement(fQU, status); 2667 fSets->addElement(fIS, status); 2668 fSets->addElement(fNU, status); 2669 fSets->addElement(fPO, status); 2670 fSets->addElement(fPR, status); 2671 
fSets->addElement(fSY, status); 2672 fSets->addElement(fAI, status); 2673 fSets->addElement(fAL, status); 2674 fSets->addElement(fHL, status); 2675 fSets->addElement(fID, status); 2676 fSets->addElement(fWJ, status); 2677 fSets->addElement(fRI, status); 2678 fSets->addElement(fSG, status); 2679 fSets->addElement(fEB, status); 2680 fSets->addElement(fEM, status); 2681 fSets->addElement(fZWJ, status); 2682 2683 2684 const char *rules = 2685 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?" 2686 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?" 2687 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*" 2688 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*" 2689 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?" 2690 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"; 2691 2692 fNumberMatcher = new RegexMatcher( 2693 UnicodeString(rules, -1, US_INV), 0, status); 2694 2695 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2696 2697 if (U_FAILURE(status)) { 2698 deferredStatus = status; 2699 } 2700 } 2701 2702 2703 void RBBILineMonkey::setText(const UnicodeString &s) { 2704 fText = &s; 2705 fCharBI->setText(s); 2706 fNumberMatcher->reset(s); 2707 } 2708 2709 // 2710 // rule9Adjust 2711 // Line Break TR rules 9 and 10 implementation. 2712 // This deals with combining marks and other sequences that 2713 // that must be treated as if they were something other than what they actually are. 2714 // 2715 // This is factored out into a separate function because it must be applied twice for 2716 // each potential break, once to the chars before the position being checked, then 2717 // again to the text following the possible break. 2718 // 2719 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 2720 if (pos == -1) { 2721 // Invalid initial position. 
Happens during the warmup iteration of the 2722 // main loop in next(). 2723 return; 2724 } 2725 2726 int32_t nPos = *nextPos; 2727 2728 // LB 9 Keep combining sequences together. 2729 // advance over any CM class chars. Note that Line Break CM is different 2730 // from the normal Grapheme Extend property. 2731 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 2732 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 2733 for (;;) { 2734 *nextChar = fText->char32At(nPos); 2735 if (!fCM->contains(*nextChar)) { 2736 break; 2737 } 2738 nPos = fText->moveIndex32(nPos, 1); 2739 } 2740 } 2741 2742 2743 // LB 9 Treat X CM* as if it were x. 2744 // No explicit action required. 2745 2746 // LB 10 Treat any remaining combining mark as AL 2747 if (fCM->contains(*posChar)) { 2748 *posChar = u'A'; 2749 } 2750 2751 // Push the updated nextPos and nextChar back to our caller. 2752 // This only makes a difference if posChar got bigger by consuming a 2753 // combining sequence. 2754 *nextPos = nPos; 2755 *nextChar = fText->char32At(nPos); 2756 } 2757 2758 2759 2760 int32_t RBBILineMonkey::next(int32_t startPos) { 2761 UErrorCode status = U_ZERO_ERROR; 2762 int32_t pos; // Index of the char following a potential break position 2763 UChar32 thisChar; // Character at above position "pos" 2764 2765 int32_t prevPos; // Index of the char preceding a potential break position 2766 UChar32 prevChar; // Character at above position. Note that prevChar 2767 // and thisChar may not be adjacent because combining 2768 // characters between them will be ignored. 2769 2770 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 2771 UChar32 prevCharX2; 2772 2773 int32_t nextPos; // Index of the next character following pos. 2774 // Usually skips over combining marks. 2775 int32_t nextCPPos; // Index of the code point following "pos." 2776 // May point to a combining mark. 2777 int32_t tPos; // temp value. 
2778 UChar32 c; 2779 2780 if (U_FAILURE(deferredStatus)) { 2781 return -1; 2782 } 2783 2784 if (startPos >= fText->length()) { 2785 return -1; 2786 } 2787 2788 2789 // Initial values for loop. Loop will run the first time without finding breaks, 2790 // while the invalid values shift out and the "this" and 2791 // "prev" positions are filled in with good values. 2792 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 2793 thisChar = prevChar = prevCharX2 = 0; 2794 nextPos = nextCPPos = startPos; 2795 2796 2797 // Loop runs once per position in the test text, until a break position 2798 // is found. 2799 for (;;) { 2800 prevPosX2 = prevPos; 2801 prevCharX2 = prevChar; 2802 2803 prevPos = pos; 2804 prevChar = thisChar; 2805 2806 pos = nextPos; 2807 thisChar = fText->char32At(pos); 2808 2809 nextCPPos = fText->moveIndex32(pos, 1); 2810 nextPos = nextCPPos; 2811 2812 // Rule LB2 - Break at end of text. 2813 if (pos >= fText->length()) { 2814 break; 2815 } 2816 2817 // Rule LB 9 - adjust for combining sequences. 2818 // We do this one out-of-order because the adjustment does not change anything 2819 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 2820 // be applied. 2821 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 2822 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 2823 c = fText->char32At(nextPos); 2824 rule9Adjust(pos, &thisChar, &nextPos, &c); 2825 2826 // If the loop is still warming up - if we haven't shifted the initial 2827 // -1 positions out of prevPos yet - loop back to advance the 2828 // position in the input without any further looking for breaks. 
2829 if (prevPos == -1) { 2830 continue; 2831 } 2832 2833 // LB 4 Always break after hard line breaks, 2834 if (fBK->contains(prevChar)) { 2835 break; 2836 } 2837 2838 // LB 5 Break after CR, LF, NL, but not inside CR LF 2839 if (prevChar == 0x0d && thisChar == 0x0a) { 2840 continue; 2841 } 2842 if (prevChar == 0x0d || 2843 prevChar == 0x0a || 2844 prevChar == 0x85) { 2845 break; 2846 } 2847 2848 // LB 6 Don't break before hard line breaks 2849 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 2850 fBK->contains(thisChar)) { 2851 continue; 2852 } 2853 2854 2855 // LB 7 Don't break before spaces or zero-width space. 2856 if (fSP->contains(thisChar)) { 2857 continue; 2858 } 2859 2860 if (fZW->contains(thisChar)) { 2861 continue; 2862 } 2863 2864 // LB 8 Break after zero width space 2865 // ZW SP* 2866 // Scan backwards from prevChar for SP* ZW 2867 tPos = prevPos; 2868 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 2869 tPos = fText->moveIndex32(tPos, -1); 2870 } 2871 if (fZW->contains(fText->char32At(tPos))) { 2872 break; 2873 } 2874 2875 // LB 25 Numbers 2876 // Move this test up, before LB8a, because numbers can match a longer sequence that would 2877 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM) 2878 if (fNumberMatcher->lookingAt(prevPos, status)) { 2879 if (U_FAILURE(status)) { 2880 break; 2881 } 2882 // Matched a number. But could have been just a single digit, which would 2883 // not represent a "no break here" between prevChar and thisChar 2884 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 2885 if (numEndIdx > pos) { 2886 // Number match includes at least our two chars being checked 2887 if (numEndIdx > nextPos) { 2888 // Number match includes additional chars. Update pos and nextPos 2889 // so that next loop iteration will continue at the end of the number, 2890 // checking for breaks between last char in number & whatever follows. 
2891 pos = nextPos = numEndIdx; 2892 do { 2893 pos = fText->moveIndex32(pos, -1); 2894 thisChar = fText->char32At(pos); 2895 } while (fCM->contains(thisChar)); 2896 } 2897 continue; 2898 } 2899 } 2900 2901 // LB 8a ZWJ x 2902 // The monkey test's way of ignoring combining characters doesn't work 2903 // for this rule. ZJ is also a CM. Need to get the actual character 2904 // preceding "thisChar", not ignoring combining marks, possibly ZJ. 2905 { 2906 int32_t prevIdx = fText->moveIndex32(pos, -1); 2907 UChar32 prevC = fText->char32At(prevIdx); 2908 if (fZWJ->contains(prevC)) { 2909 continue; 2910 } 2911 } 2912 2913 // LB 9, 10 Already done, at top of loop. 2914 // 2915 2916 2917 // LB 11 Do not break before or after WORD JOINER and related characters. 2918 // x WJ 2919 // WJ x 2920 // 2921 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 2922 continue; 2923 } 2924 2925 // LB 12 2926 // GL x 2927 if (fGL->contains(prevChar)) { 2928 continue; 2929 } 2930 2931 // LB 12a 2932 // [^SP BA HY] x GL 2933 if (!(fSP->contains(prevChar) || 2934 fBA->contains(prevChar) || 2935 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 2936 continue; 2937 } 2938 2939 2940 2941 // LB 13 Don't break before closings. 2942 // NU x CL, NU x CP and NU x IS are not matched here so that they will 2943 // fall into LB 17 and the more general number regular expression. 2944 // 2945 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 2946 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 2947 fEX->contains(thisChar) || 2948 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 2949 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 2950 continue; 2951 } 2952 2953 // LB 14 Don't break after OP SP* 2954 // Scan backwards, checking for this sequence. 
2955 // The OP char could include combining marks, so we actually check for 2956 // OP CM* SP* 2957 // Another Twist: The Rule 67 fixes may have changed a SP CM 2958 // sequence into a ID char, so before scanning back through spaces, 2959 // verify that prevChar is indeed a space. The prevChar variable 2960 // may differ from fText[prevPos] 2961 tPos = prevPos; 2962 if (fSP->contains(prevChar)) { 2963 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 2964 tPos=fText->moveIndex32(tPos, -1); 2965 } 2966 } 2967 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 2968 tPos=fText->moveIndex32(tPos, -1); 2969 } 2970 if (fOP->contains(fText->char32At(tPos))) { 2971 continue; 2972 } 2973 2974 2975 // LB 15 QU SP* x OP 2976 if (fOP->contains(thisChar)) { 2977 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 2978 int tPos = prevPos; 2979 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 2980 tPos = fText->moveIndex32(tPos, -1); 2981 } 2982 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 2983 tPos = fText->moveIndex32(tPos, -1); 2984 } 2985 if (fQU->contains(fText->char32At(tPos))) { 2986 continue; 2987 } 2988 } 2989 2990 2991 2992 // LB 16 (CL | CP) SP* x NS 2993 // Scan backwards for SP* CM* (CL | CP) 2994 if (fNS->contains(thisChar)) { 2995 int tPos = prevPos; 2996 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 2997 tPos = fText->moveIndex32(tPos, -1); 2998 } 2999 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3000 tPos = fText->moveIndex32(tPos, -1); 3001 } 3002 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3003 continue; 3004 } 3005 } 3006 3007 3008 // LB 17 B2 SP* x B2 3009 if (fB2->contains(thisChar)) { 3010 // Scan backwards, checking for the B2 CM* SP* sequence. 
3011 tPos = prevPos; 3012 if (fSP->contains(prevChar)) { 3013 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3014 tPos=fText->moveIndex32(tPos, -1); 3015 } 3016 } 3017 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3018 tPos=fText->moveIndex32(tPos, -1); 3019 } 3020 if (fB2->contains(fText->char32At(tPos))) { 3021 continue; 3022 } 3023 } 3024 3025 3026 // LB 18 break after space 3027 if (fSP->contains(prevChar)) { 3028 break; 3029 } 3030 3031 // LB 19 3032 // x QU 3033 // QU x 3034 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3035 continue; 3036 } 3037 3038 // LB 20 Break around a CB 3039 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3040 break; 3041 } 3042 3043 // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen. 3044 // Formerly this was a Finnish tailoring. 3045 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. 3046 // ^($HY | $HH) $AL; 3047 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) && 3048 prevPosX2 == -1) { 3049 continue; 3050 } 3051 3052 // LB 21 3053 if (fBA->contains(thisChar) || 3054 fHY->contains(thisChar) || 3055 fNS->contains(thisChar) || 3056 fBB->contains(prevChar) ) { 3057 continue; 3058 } 3059 3060 // LB 21a 3061 // HL (HY | BA) x 3062 if (fHL->contains(prevCharX2) && 3063 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3064 continue; 3065 } 3066 3067 // LB 21b 3068 // SY x HL 3069 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3070 continue; 3071 } 3072 3073 // LB 22 3074 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3075 (fEX->contains(prevChar) && fIN->contains(thisChar)) || 3076 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3077 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) || 3078 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3079 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 
3080 continue; 3081 } 3082 3083 3084 // LB 23 (AL | HL) x NU 3085 // NU x (AL | HL) 3086 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) { 3087 continue; 3088 } 3089 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3090 continue; 3091 } 3092 3093 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 3094 // PR x (ID | EB | EM) 3095 // (ID | EB | EM) x PO 3096 if (fPR->contains(prevChar) && 3097 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { 3098 continue; 3099 } 3100 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && 3101 fPO->contains(thisChar)) { 3102 continue; 3103 } 3104 3105 // LB 24 Do not break between prefix and letters or ideographs. 3106 // (PR | PO) x (AL | HL) 3107 // (AL | HL) x (PR | PO) 3108 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) && 3109 (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3110 continue; 3111 } 3112 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && 3113 (fPR->contains(thisChar) || fPO->contains(thisChar))) { 3114 continue; 3115 } 3116 3117 // LB 25 numbers match, moved up, before LB 8a, 3118 3119 // LB 26 Do not break a Korean syllable. 3120 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3121 fJV->contains(thisChar) || 3122 fH2->contains(thisChar) || 3123 fH3->contains(thisChar))) { 3124 continue; 3125 } 3126 3127 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3128 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3129 continue; 3130 } 3131 3132 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3133 fJT->contains(thisChar)) { 3134 continue; 3135 } 3136 3137 // LB 27 Treat a Korean Syllable Block the same as ID. 
3138 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3139 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3140 fIN->contains(thisChar)) { 3141 continue; 3142 } 3143 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3144 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3145 fPO->contains(thisChar)) { 3146 continue; 3147 } 3148 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3149 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3150 continue; 3151 } 3152 3153 3154 3155 // LB 28 Do not break between alphabetics ("at"). 3156 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3157 continue; 3158 } 3159 3160 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3161 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3162 continue; 3163 } 3164 3165 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 3166 // (AL | NU) x OP 3167 // CP x (AL | NU) 3168 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3169 continue; 3170 } 3171 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3172 continue; 3173 } 3174 3175 // LB30a RI RI RI 3176 // RI x RI 3177 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { 3178 break; 3179 } 3180 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3181 // Two Regional Indicators have been paired. 3182 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a 3183 // following RI. This is a hack. 
            thisChar = -1;
            continue;
        }

        // LB30b Emoji Base x Emoji Modifier
        if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
            continue;
        }

        // LB 31 Break everywhere else
        break;

    }

    return pos;
}


// Return the list of character-class sets held in fSets.
// The pointer itself is returned (no copy is made), and the destructor below
// deletes fSets, so callers must not delete the result.
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}


// Destructor.  Deletes every object this monkey holds through a raw pointer:
// the fSets list, each individual line-break class set (fBK ... fZWJ),
// and the fCharBI and fNumberMatcher helper objects.
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHH;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSG;
    delete fXX;
    delete fEB;
    delete fEM;
    delete fZWJ;

    delete fCharBI;
    delete fNumberMatcher;
}


//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
3269 // 3270 // type = char | word | line | sent | title 3271 // 3272 // Example: 3273 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1" 3274 // 3275 //------------------------------------------------------------------------------------------- 3276 3277 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3278 int32_t val = defaultVal; 3279 name.append(" *= *(-?\\d+)"); 3280 UErrorCode status = U_ZERO_ERROR; 3281 RegexMatcher m(name, params, 0, status); 3282 if (m.find()) { 3283 // The param exists. Convert the string to an int. 3284 char valString[100]; 3285 int32_t paramLength = m.end(1, status) - m.start(1, status); 3286 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3287 paramLength = (int32_t)(sizeof(valString)-2); 3288 } 3289 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3290 val = strtol(valString, NULL, 10); 3291 3292 // Delete this parameter from the params string. 3293 m.reset(); 3294 params = m.replaceFirst("", status); 3295 } 3296 U_ASSERT(U_SUCCESS(status)); 3297 return val; 3298 } 3299 #endif 3300 3301 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3302 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3303 BreakIterator *bi, 3304 int expected[], 3305 int expectedcount) 3306 { 3307 int count = 0; 3308 int i = 0; 3309 int forward[50]; 3310 bi->setText(ustr); 3311 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3312 forward[count] = i; 3313 if (count < expectedcount && expected[count] != i) { 3314 test->errln("%s:%d break forward test failed: expected %d but got %d", 3315 __FILE__, __LINE__, expected[count], i); 3316 break; 3317 } 3318 count ++; 3319 } 3320 if (count != expectedcount) { 3321 printStringBreaks(ustr, expected, expectedcount); 3322 test->errln("%s:%d break forward test failed: missed %d match", 3323 __FILE__, __LINE__, expectedcount - count); 3324 return; 3325 } 3326 // testing boundaries 3327 for (i = 1; i < expectedcount; i ++) { 
3328 int j = expected[i - 1]; 3329 if (!bi->isBoundary(j)) { 3330 printStringBreaks(ustr, expected, expectedcount); 3331 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d", 3332 __FILE__, __LINE__, j); 3333 return; 3334 } 3335 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3336 if (bi->isBoundary(j)) { 3337 printStringBreaks(ustr, expected, expectedcount); 3338 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d", 3339 __FILE__, __LINE__, j); 3340 return; 3341 } 3342 } 3343 } 3344 3345 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3346 count --; 3347 if (forward[count] != i) { 3348 printStringBreaks(ustr, expected, expectedcount); 3349 test->errln("%s:%d happy break test previous() failed: expected %d but got %d", 3350 __FILE__, __LINE__, forward[count], i); 3351 break; 3352 } 3353 } 3354 if (count != 0) { 3355 printStringBreaks(ustr, expected, expectedcount); 3356 test->errln("break test previous() failed: missed a match"); 3357 return; 3358 } 3359 3360 // testing preceding 3361 for (i = 0; i < expectedcount - 1; i ++) { 3362 // int j = expected[i] + 1; 3363 int j = ustr.moveIndex32(expected[i], 1); 3364 for (; j <= expected[i + 1]; j ++) { 3365 int32_t expectedPreceding = expected[i]; 3366 int32_t actualPreceding = bi->preceding(j); 3367 if (actualPreceding != expectedPreceding) { 3368 printStringBreaks(ustr, expected, expectedcount); 3369 test->errln("%s:%d preceding(%d): expected %d, got %d", 3370 __FILE__, __LINE__, j, expectedPreceding, actualPreceding); 3371 return; 3372 } 3373 } 3374 } 3375 } 3376 #endif 3377 3378 void RBBITest::TestWordBreaks(void) 3379 { 3380 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3381 3382 Locale locale("en"); 3383 UErrorCode status = U_ZERO_ERROR; 3384 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3385 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3386 // Replaced any C+J characters in a row with a 
random sequence of characters 3387 // of the same length to make our C+J segmentation not get in the way. 3388 static const char *strlist[] = 3389 { 3390 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3391 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3392 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3393 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3394 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3395 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3396 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3397 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3398 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3399 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3400 "\\u2027\\U000e0067\\u0a47\\u00b7", 3401 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3402 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3403 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3404 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3405 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3406 "\\u0027\\u11af\\U000e0057\\u0602", 3407 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3408 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3409 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3410 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3411 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3412 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3413 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3414 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3415 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3416 "\\u18f4\\U000e0049\\u20e7\\u2027", 3417 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3418 "\\ua183\\u102d\\u0bec\\u003a", 3419 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3420 "\\u003a\\u0e57\\u0fad\\u002e", 3421 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3422 
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3423 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3424 "\\u003a\\u0664\\u00b7\\u1fba", 3425 "\\u003b\\u0027\\u00b7\\u47a3", 3426 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3427 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3428 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3429 }; 3430 int loop; 3431 if (U_FAILURE(status)) { 3432 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3433 return; 3434 } 3435 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3436 // printf("looping %d\n", loop); 3437 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3438 // RBBICharMonkey monkey; 3439 RBBIWordMonkey monkey; 3440 3441 int expected[50]; 3442 int expectedcount = 0; 3443 3444 monkey.setText(ustr); 3445 int i; 3446 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3447 expected[expectedcount ++] = i; 3448 } 3449 3450 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3451 } 3452 delete bi; 3453 #endif 3454 } 3455 3456 void RBBITest::TestWordBoundary(void) 3457 { 3458 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3459 Locale locale("en"); 3460 UErrorCode status = U_ZERO_ERROR; 3461 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3462 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status); 3463 if (U_FAILURE(status)) { 3464 errcheckln(status, "%s:%d Creation of break iterator failed %s", 3465 __FILE__, __LINE__, u_errorName(status)); 3466 return; 3467 } 3468 UChar str[50]; 3469 static const char *strlist[] = 3470 { 3471 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3472 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3473 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3474 "\\u2027\\U000e0067\\u0a47\\u00b7", 3475 
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3476 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3477 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3478 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3479 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3480 "\\u0027\\u11af\\U000e0057\\u0602", 3481 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3482 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3483 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3484 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3485 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3486 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3487 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3488 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3489 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3490 "\\u58f4\\U000e0049\\u20e7\\u2027", 3491 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3492 "\\ua183\\u102d\\u0bec\\u003a", 3493 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3494 "\\u003a\\u0e57\\u0fad\\u002e", 3495 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3496 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3497 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3498 "\\u003a\\u0664\\u00b7\\u1fba", 3499 "\\u003b\\u0027\\u00b7\\u47a3", 3500 }; 3501 int loop; 3502 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3503 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); 3504 UnicodeString ustr(str); 3505 int forward[50]; 3506 int count = 0; 3507 3508 bi->setText(ustr); 3509 int prev = -1; 3510 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) { 3511 ++count; 3512 if (count >= UPRV_LENGTHOF(forward)) { 3513 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)", 3514 __FILE__, __LINE__, loop, count, boundary); 3515 return; 3516 } 3517 forward[count] = boundary; 3518 if (boundary <= prev) { 3519 errln("%s:%d bi::next() did not advance. 
(loop, prev, boundary) = (%d, %d, %d)\n", 3520 __FILE__, __LINE__, loop, prev, boundary); 3521 break; 3522 } 3523 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) { 3524 if (bi->isBoundary(nonBoundary)) { 3525 printStringBreaks(ustr, forward, count); 3526 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)", 3527 __FILE__, __LINE__, loop, prev, nonBoundary, boundary); 3528 return; 3529 } 3530 } 3531 if (!bi->isBoundary(boundary)) { 3532 printStringBreaks(ustr, forward, count); 3533 errln("%s:%d happy boundary test failed: expected %d a boundary", 3534 __FILE__, __LINE__, boundary); 3535 return; 3536 } 3537 prev = boundary; 3538 } 3539 } 3540 } 3541 3542 void RBBITest::TestLineBreaks(void) 3543 { 3544 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3545 Locale locale("en"); 3546 UErrorCode status = U_ZERO_ERROR; 3547 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3548 const int32_t STRSIZE = 50; 3549 UChar str[STRSIZE]; 3550 static const char *strlist[] = 3551 { 3552 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3553 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3554 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3555 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3556 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3557 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3558 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3559 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3560 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3561 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3562 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3563 
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3564 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3565 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3566 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3567 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3568 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3569 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3570 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3571 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3572 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3573 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3574 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3575 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3576 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3577 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3578 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3579 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3580 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3581 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3582 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3583 
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3584 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3585 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3586 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3587 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3588 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3589 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3590 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3591 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3592 }; 3593 int loop; 3594 TEST_ASSERT_SUCCESS(status); 3595 if (U_FAILURE(status)) { 3596 return; 3597 } 3598 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3599 // printf("looping %d\n", loop); 3600 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3601 if (t >= STRSIZE) { 3602 TEST_ASSERT(FALSE); 3603 continue; 3604 } 3605 3606 3607 UnicodeString ustr(str); 3608 RBBILineMonkey monkey; 3609 if (U_FAILURE(monkey.deferredStatus)) { 3610 continue; 3611 } 3612 3613 const int EXPECTEDSIZE = 50; 3614 int expected[EXPECTEDSIZE]; 3615 int expectedcount = 0; 3616 3617 monkey.setText(ustr); 3618 int i; 3619 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3620 if (expectedcount >= EXPECTEDSIZE) { 3621 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3622 return; 3623 } 3624 expected[expectedcount ++] = i; 3625 } 3626 3627 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3628 } 3629 delete bi; 3630 #endif 3631 } 3632 3633 void RBBITest::TestSentBreaks(void) 3634 { 3635 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3636 Locale locale("en"); 3637 UErrorCode status = U_ZERO_ERROR; 3638 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3639 UChar str[200]; 3640 static const char *strlist[] = 3641 { 3642 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3643 "This\n", 3644 "Hello! how are you? I'am fine. Thankyou. How are you doing? 
This\n costs $20,00,000.", 3645 "\"Sentence ending with a quote.\" Bye.", 3646 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3647 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3648 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3649 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3650 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3651 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3652 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3653 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3654 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3655 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3656 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3657 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3658 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3659 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3660 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3661 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3662 }; 3663 int loop; 3664 if (U_FAILURE(status)) { 3665 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3666 return; 3667 } 3668 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3669 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); 3670 UnicodeString ustr(str); 3671 3672 RBBISentMonkey monkey; 3673 if (U_FAILURE(monkey.deferredStatus)) { 3674 continue; 3675 } 3676 3677 const int EXPECTEDSIZE = 50; 3678 int expected[EXPECTEDSIZE]; 3679 int expectedcount = 0; 3680 3681 monkey.setText(ustr); 3682 int i; 3683 for (i = 0; i != BreakIterator::DONE; i = 
monkey.next(i)) { 3684 if (expectedcount >= EXPECTEDSIZE) { 3685 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3686 return; 3687 } 3688 expected[expectedcount ++] = i; 3689 } 3690 3691 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3692 } 3693 delete bi; 3694 #endif 3695 } 3696 3697 void RBBITest::TestMonkey() { 3698 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3699 3700 UErrorCode status = U_ZERO_ERROR; 3701 int32_t loopCount = 500; 3702 int32_t seed = 1; 3703 UnicodeString breakType = "all"; 3704 Locale locale("en"); 3705 UBool useUText = FALSE; 3706 3707 if (quick == FALSE) { 3708 loopCount = 10000; 3709 } 3710 3711 if (fTestParams) { 3712 UnicodeString p(fTestParams); 3713 loopCount = getIntParam("loop", p, loopCount); 3714 seed = getIntParam("seed", p, seed); 3715 3716 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 3717 if (m.find()) { 3718 breakType = m.group(1, status); 3719 m.reset(); 3720 p = m.replaceFirst("", status); 3721 } 3722 3723 RegexMatcher u(" *utext", p, 0, status); 3724 if (u.find()) { 3725 useUText = TRUE; 3726 u.reset(); 3727 p = u.replaceFirst("", status); 3728 } 3729 3730 3731 // m.reset(p); 3732 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 3733 // Each option is stripped out of the option string as it is processed. 3734 // All options have been checked. The option string should have been completely emptied.. 
3735 char buf[100]; 3736 p.extract(buf, sizeof(buf), NULL, status); 3737 buf[sizeof(buf)-1] = 0; 3738 errln("Unrecognized or extra parameter: %s\n", buf); 3739 return; 3740 } 3741 3742 } 3743 3744 if (breakType == "char" || breakType == "all") { 3745 RBBICharMonkey m; 3746 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3747 if (U_SUCCESS(status)) { 3748 RunMonkey(bi, m, "char", seed, loopCount, useUText); 3749 if (breakType == "all" && useUText==FALSE) { 3750 // Also run a quick test with UText when "all" is specified 3751 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 3752 } 3753 } 3754 else { 3755 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 3756 } 3757 delete bi; 3758 } 3759 3760 if (breakType == "word" || breakType == "all") { 3761 logln("Word Break Monkey Test"); 3762 RBBIWordMonkey m; 3763 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3764 if (U_SUCCESS(status)) { 3765 RunMonkey(bi, m, "word", seed, loopCount, useUText); 3766 } 3767 else { 3768 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 3769 } 3770 delete bi; 3771 } 3772 3773 if (breakType == "line" || breakType == "all") { 3774 logln("Line Break Monkey Test"); 3775 RBBILineMonkey m; 3776 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3777 if (loopCount >= 10) { 3778 loopCount = loopCount / 5; // Line break runs slower than the others. 
3779 } 3780 if (U_SUCCESS(status)) { 3781 RunMonkey(bi, m, "line", seed, loopCount, useUText); 3782 } 3783 else { 3784 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3785 } 3786 delete bi; 3787 } 3788 3789 if (breakType == "sent" || breakType == "all" ) { 3790 logln("Sentence Break Monkey Test"); 3791 RBBISentMonkey m; 3792 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3793 if (loopCount >= 10) { 3794 loopCount = loopCount / 10; // Sentence runs slower than the other break types 3795 } 3796 if (U_SUCCESS(status)) { 3797 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 3798 } 3799 else { 3800 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3801 } 3802 delete bi; 3803 } 3804 3805 #endif 3806 } 3807 3808 // 3809 // Run a RBBI monkey test. Common routine, for all break iterator types. 3810 // Parameters: 3811 // bi - the break iterator to use 3812 // mk - MonkeyKind, abstraction for obtaining expected results 3813 // name - Name of test (char, word, etc.) 
for use in error messages 3814 // seed - Seed for starting random number generator (parameter from user) 3815 // numIterations 3816 // 3817 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 3818 int32_t numIterations, UBool useUText) { 3819 3820 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3821 3822 const int32_t TESTSTRINGLEN = 500; 3823 UnicodeString testText; 3824 int32_t numCharClasses; 3825 UVector *chClasses; 3826 int expected[TESTSTRINGLEN*2 + 1]; 3827 int expectedCount = 0; 3828 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 3829 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 3830 char reverseBreaks[TESTSTRINGLEN*2+1]; 3831 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 3832 char followingBreaks[TESTSTRINGLEN*2+1]; 3833 char precedingBreaks[TESTSTRINGLEN*2+1]; 3834 int i; 3835 int loopCount = 0; 3836 3837 m_seed = seed; 3838 3839 numCharClasses = mk.charClasses()->size(); 3840 chClasses = mk.charClasses(); 3841 3842 // Check for errors that occured during the construction of the MonkeyKind object. 3843 // Can't report them where they occured because errln() is a method coming from intlTest, 3844 // and is not visible outside of RBBITest :-( 3845 if (U_FAILURE(mk.deferredStatus)) { 3846 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 3847 return; 3848 } 3849 3850 // Verify that the character classes all have at least one member. 3851 for (i=0; i<numCharClasses; i++) { 3852 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 3853 if (s == NULL || s->size() == 0) { 3854 errln("Character Class #%d is null or of zero size.", i); 3855 return; 3856 } 3857 } 3858 3859 while (loopCount < numIterations || numIterations == -1) { 3860 if (numIterations == -1 && loopCount % 10 == 0) { 3861 // If test is running in an infinite loop, display a periodic tic so 3862 // we can tell that it is making progress. 
3863 fprintf(stderr, "."); 3864 } 3865 // Save current random number seed, so that we can recreate the random numbers 3866 // for this loop iteration in event of an error. 3867 seed = m_seed; 3868 3869 // Populate a test string with data. 3870 testText.truncate(0); 3871 for (i=0; i<TESTSTRINGLEN; i++) { 3872 int32_t aClassNum = m_rand() % numCharClasses; 3873 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 3874 int32_t charIdx = m_rand() % classSet->size(); 3875 UChar32 c = classSet->charAt(charIdx); 3876 if (c < 0) { // TODO: deal with sets containing strings. 3877 errln("%s:%d c < 0", __FILE__, __LINE__); 3878 break; 3879 } 3880 // Do not assemble a supplementary character from randomly generated separate surrogates. 3881 // (It could be a dictionary character) 3882 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) { 3883 continue; 3884 } 3885 3886 testText.append(c); 3887 } 3888 3889 // Calculate the expected results for this test string. 3890 mk.setText(testText); 3891 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 3892 expectedBreaks[0] = 1; 3893 int32_t breakPos = 0; 3894 expectedCount = 0; 3895 for (;;) { 3896 breakPos = mk.next(breakPos); 3897 if (breakPos == -1) { 3898 break; 3899 } 3900 if (breakPos > testText.length()) { 3901 errln("breakPos > testText.length()"); 3902 } 3903 expectedBreaks[breakPos] = 1; 3904 U_ASSERT(expectedCount<testText.length()); 3905 expected[expectedCount ++] = breakPos; 3906 (void)expected; // Set but not used warning. 3907 // TODO (andy): check it out. 
3908 } 3909 3910 // Find the break positions using forward iteration 3911 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 3912 if (useUText) { 3913 UErrorCode status = U_ZERO_ERROR; 3914 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 3915 // testUText = utext_openUnicodeString(testUText, &testText, &status); 3916 bi->setText(testUText, status); 3917 TEST_ASSERT_SUCCESS(status); 3918 utext_close(testUText); // The break iterator does a shallow clone of the UText 3919 // This UText can be closed immediately, so long as the 3920 // testText string continues to exist. 3921 } else { 3922 bi->setText(testText); 3923 } 3924 3925 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 3926 if (i < 0 || i > testText.length()) { 3927 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 3928 break; 3929 } 3930 forwardBreaks[i] = 1; 3931 } 3932 3933 // Find the break positions using reverse iteration 3934 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 3935 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 3936 if (i < 0 || i > testText.length()) { 3937 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 3938 break; 3939 } 3940 reverseBreaks[i] = 1; 3941 } 3942 3943 // Find the break positions using isBoundary() tests. 3944 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 3945 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 3946 for (i=0; i<=testText.length(); i++) { 3947 isBoundaryBreaks[i] = bi->isBoundary(i); 3948 } 3949 3950 3951 // Find the break positions using the following() function. 
3952 // printf("."); 3953 memset(followingBreaks, 0, sizeof(followingBreaks)); 3954 int32_t lastBreakPos = 0; 3955 followingBreaks[0] = 1; 3956 for (i=0; i<testText.length(); i++) { 3957 breakPos = bi->following(i); 3958 if (breakPos <= i || 3959 breakPos < lastBreakPos || 3960 breakPos > testText.length() || 3961 (breakPos > lastBreakPos && lastBreakPos > i)) { 3962 errln("%s break monkey test: " 3963 "Out of range value returned by BreakIterator::following().\n" 3964 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 3965 name, seed, i, breakPos, lastBreakPos); 3966 break; 3967 } 3968 followingBreaks[breakPos] = 1; 3969 lastBreakPos = breakPos; 3970 } 3971 3972 // Find the break positions using the preceding() function. 3973 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 3974 lastBreakPos = testText.length(); 3975 precedingBreaks[testText.length()] = 1; 3976 for (i=testText.length(); i>0; i--) { 3977 breakPos = bi->preceding(i); 3978 if (breakPos >= i || 3979 breakPos > lastBreakPos || 3980 (breakPos < 0 && testText.getChar32Start(i)>0) || 3981 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 3982 errln("%s break monkey test: " 3983 "Out of range value returned by BreakIterator::preceding().\n" 3984 "index=%d; prev returned %d; lastBreak=%d" , 3985 name, i, breakPos, lastBreakPos); 3986 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 3987 precedingBreaks[i] = 2; // Forces an error. 3988 } 3989 } else { 3990 if (breakPos >= 0) { 3991 precedingBreaks[breakPos] = 1; 3992 } 3993 lastBreakPos = breakPos; 3994 } 3995 } 3996 3997 // Compare the expected and actual results. 
3998 for (i=0; i<=testText.length(); i++) { 3999 const char *errorType = NULL; 4000 if (forwardBreaks[i] != expectedBreaks[i]) { 4001 errorType = "next()"; 4002 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4003 errorType = "previous()"; 4004 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4005 errorType = "isBoundary()"; 4006 } else if (followingBreaks[i] != expectedBreaks[i]) { 4007 errorType = "following()"; 4008 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4009 errorType = "preceding()"; 4010 } 4011 4012 4013 if (errorType != NULL) { 4014 // Format a range of the test text that includes the failure as 4015 // a data item that can be included in the rbbi test data file. 4016 4017 // Start of the range is the last point where expected and actual results 4018 // both agreed that there was a break position. 4019 int startContext = i; 4020 int32_t count = 0; 4021 for (;;) { 4022 if (startContext==0) { break; } 4023 startContext --; 4024 if (expectedBreaks[startContext] != 0) { 4025 if (count == 2) break; 4026 count ++; 4027 } 4028 } 4029 4030 // End of range is two expected breaks past the start position. 4031 int endContext = i + 1; 4032 int ci; 4033 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4034 for (;;) { 4035 if (endContext >= testText.length()) {break;} 4036 if (expectedBreaks[endContext-1] != 0) { 4037 if (count == 0) break; 4038 count --; 4039 } 4040 endContext ++; 4041 } 4042 } 4043 4044 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4045 UnicodeString errorText = "<data>"; 4046 /***if (strcmp(errorType, "next()") == 0) { 4047 startContext = 0; 4048 endContext = testText.length(); 4049 4050 printStringBreaks(testText, expected, expectedCount); 4051 }***/ 4052 4053 for (ci=startContext; ci<endContext;) { 4054 UnicodeString hexChars("0123456789abcdef"); 4055 UChar32 c; 4056 int bn; 4057 c = testText.char32At(ci); 4058 if (ci == i) { 4059 // This is the location of the error. 
4060 errorText.append("<?>"); 4061 } else if (expectedBreaks[ci] != 0) { 4062 // This a non-error expected break position. 4063 errorText.append("\\"); 4064 } 4065 if (c < 0x10000) { 4066 errorText.append("\\u"); 4067 for (bn=12; bn>=0; bn-=4) { 4068 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4069 } 4070 } else { 4071 errorText.append("\\U"); 4072 for (bn=28; bn>=0; bn-=4) { 4073 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4074 } 4075 } 4076 ci = testText.moveIndex32(ci, 1); 4077 } 4078 errorText.append("\\"); 4079 errorText.append("</data>\n"); 4080 4081 // Output the error 4082 char charErrorTxt[500]; 4083 UErrorCode status = U_ZERO_ERROR; 4084 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4085 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4086 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4087 4088 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4089 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4090 errorType, seed, i, charErrorTxt); 4091 break; 4092 } 4093 } 4094 4095 loopCount++; 4096 } 4097 #endif 4098 } 4099 4100 4101 // Bug 5532. UTF-8 based UText fails in dictionary code. 4102 // This test checks the initial patch, 4103 // which is to just keep it from crashing. Correct word boundaries 4104 // await a proper fix to the dictionary code. 4105 // 4106 void RBBITest::TestBug5532(void) { 4107 // Text includes a mixture of Thai and Latin. 
4108 const unsigned char utf8Data[] = { 4109 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4110 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4111 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4112 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4113 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4114 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4115 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4116 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4117 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4118 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4119 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4120 4121 UErrorCode status = U_ZERO_ERROR; 4122 UText utext=UTEXT_INITIALIZER; 4123 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4124 TEST_ASSERT_SUCCESS(status); 4125 4126 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4127 TEST_ASSERT_SUCCESS(status); 4128 if (U_SUCCESS(status)) { 4129 bi->setText(&utext, status); 4130 TEST_ASSERT_SUCCESS(status); 4131 4132 int32_t breakCount = 0; 4133 int32_t previousBreak = -1; 4134 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4135 // For now, just make sure that the break iterator doesn't hang. 
4136 TEST_ASSERT(previousBreak < bi->current()); 4137 previousBreak = bi->current(); 4138 } 4139 TEST_ASSERT(breakCount > 0); 4140 } 4141 delete bi; 4142 utext_close(&utext); 4143 } 4144 4145 4146 void RBBITest::TestBug9983(void) { 4147 UnicodeString text = UnicodeString("\\u002A" // * Other 4148 "\\uFF65" // Other 4149 "\\u309C" // Katakana 4150 "\\uFF9F" // Extend 4151 "\\uFF65" // Other 4152 "\\u0020" // Other 4153 "\\u0000").unescape(); 4154 4155 UErrorCode status = U_ZERO_ERROR; 4156 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4157 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4158 TEST_ASSERT_SUCCESS(status); 4159 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4160 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4161 TEST_ASSERT_SUCCESS(status); 4162 if (U_FAILURE(status)) { 4163 return; 4164 } 4165 int32_t offset, rstatus, iterationCount; 4166 4167 brkiter->setText(text); 4168 brkiter->last(); 4169 iterationCount = 0; 4170 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4171 iterationCount++; 4172 rstatus = brkiter->getRuleStatus(); 4173 (void)rstatus; // Suppress set but not used warning. 4174 if (iterationCount >= 10) { 4175 break; 4176 } 4177 } 4178 TEST_ASSERT(iterationCount == 6); 4179 4180 brkiterPOSIX->setText(text); 4181 brkiterPOSIX->last(); 4182 iterationCount = 0; 4183 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4184 iterationCount++; 4185 rstatus = brkiterPOSIX->getRuleStatus(); 4186 (void)rstatus; // Suppress set but not used warning. 4187 if (iterationCount >= 10) { 4188 break; 4189 } 4190 } 4191 TEST_ASSERT(iterationCount == 6); 4192 } 4193 4194 // Bug 7547 - verify that building a break itereator from empty rules produces an error. 
4195 // 4196 void RBBITest::TestBug7547() { 4197 UnicodeString rules; 4198 UErrorCode status = U_ZERO_ERROR; 4199 UParseError parseError; 4200 RuleBasedBreakIterator breakIterator(rules, parseError, status); 4201 if (status != U_BRK_RULE_SYNTAX) { 4202 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status)); 4203 } 4204 if (parseError.line != 1 || parseError.offset != 0) { 4205 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset); 4206 } 4207 } 4208 4209 4210 void RBBITest::TestBug12797() { 4211 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;"; 4212 UErrorCode status = U_ZERO_ERROR; 4213 UParseError parseError; 4214 RuleBasedBreakIterator bi(rules, parseError, status); 4215 if (U_FAILURE(status)) { 4216 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status)); 4217 return; 4218 } 4219 UnicodeString text = "abc"; 4220 bi.setText(text); 4221 bi.first(); 4222 int32_t boundary = bi.next(); 4223 if (boundary != 3) { 4224 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary); 4225 } 4226 } 4227 4228 void RBBITest::TestBug12918() { 4229 // This test triggers an assertion failure in dictbe.cpp 4230 const UChar *crasherString = u"\u3325\u4a16"; 4231 UErrorCode status = U_ZERO_ERROR; 4232 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status); 4233 if (U_FAILURE(status)) { 4234 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status)); 4235 return; 4236 } 4237 ubrk_first(iter); 4238 int32_t pos = 0; 4239 int32_t lastPos = -1; 4240 while((pos = ubrk_next(iter)) != UBRK_DONE) { 4241 if (pos <= lastPos) { 4242 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos); 4243 break; 4244 } 4245 } 4246 ubrk_close(iter); 4247 } 4248 4249 void RBBITest::TestBug12932() { 4250 // Node Stack overflow in the RBBI rule parser caused a seg fault. 
4251 UnicodeString ruleStr( 4252 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" 4253 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" 4254 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))" 4255 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))" 4256 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))" 4257 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"); 4258 4259 UErrorCode status = U_ZERO_ERROR; 4260 UParseError parseError; 4261 RuleBasedBreakIterator rbbi(ruleStr, parseError, status); 4262 if (status != U_BRK_RULE_SYNTAX) { 4263 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s", 4264 __FILE__, __LINE__, u_errorName(status)); 4265 } 4266 } 4267 4268 4269 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt 4270 // remain undevided by ICU char, word and line break. 4271 void RBBITest::TestEmoji() { 4272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4273 UErrorCode status = U_ZERO_ERROR; 4274 4275 CharString testFileName; 4276 testFileName.append(IntlTest::getSourceTestData(status), status); 4277 testFileName.appendPathPart("emoji-test.txt", status); 4278 if (U_FAILURE(status)) { 4279 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status)); 4280 return; 4281 } 4282 logln("Opening data file %s\n", testFileName.data()); 4283 4284 int len; 4285 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); 4286 if (U_FAILURE(status) || testFile == NULL) { 4287 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status)); 4288 return; 4289 } 4290 UnicodeString testFileAsString(testFile, len); 4291 delete [] testFile; 4292 4293 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status); 4294 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status); 4295 // hexMatcher group(1) is a 
hex number, or empty string if no hex number present. 4296 int32_t lineNumber = 0; 4297 4298 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status); 4299 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status); 4300 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status); 4301 if (U_FAILURE(status)) { 4302 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status)); 4303 return; 4304 } 4305 4306 while (lineMatcher.find()) { 4307 ++lineNumber; 4308 UnicodeString line = lineMatcher.group(status); 4309 hexMatcher.reset(line); 4310 UnicodeString testString; // accumulates the emoji sequence. 4311 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) { 4312 UnicodeString hex = hexMatcher.group(1, status); 4313 if (hex.length() > 8) { 4314 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)()); 4315 break; 4316 } 4317 CharString hex8; 4318 hex8.appendInvariantChars(hex, status); 4319 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16); 4320 if (c<=0x10ffff) { 4321 testString.append(c); 4322 } else { 4323 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.", 4324 __FILE__, __LINE__, lineNumber, hex8.data()); 4325 break; 4326 } 4327 } 4328 4329 if (testString.length() > 1) { 4330 charBreaks->setText(testString); 4331 charBreaks->first(); 4332 int32_t firstBreak = charBreaks->next(); 4333 if (testString.length() != firstBreak) { 4334 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4335 __FILE__, __LINE__, lineNumber, firstBreak); 4336 } 4337 wordBreaks->setText(testString); 4338 wordBreaks->first(); 4339 firstBreak = wordBreaks->next(); 4340 if (testString.length() != firstBreak) { 4341 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4342 
__FILE__, __LINE__, lineNumber, firstBreak); 4343 } 4344 lineBreaks->setText(testString); 4345 lineBreaks->first(); 4346 firstBreak = lineBreaks->next(); 4347 if (testString.length() != firstBreak) { 4348 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", 4349 __FILE__, __LINE__, lineNumber, firstBreak); 4350 } 4351 } 4352 } 4353 #endif 4354 } 4355 4356 4357 // TestBug12519 - Correct handling of Locales by assignment / copy / clone 4358 4359 // WHERE Macro yields a literal string of the form "source_file_name:line number " 4360 // TODO: propose something equivalent as a test framework addition. 4361 4362 #define WHERE __FILE__ ":" XLINE(__LINE__) " " 4363 #define XLINE(s) LINE(s) 4364 #define LINE(s) #s 4365 4366 void RBBITest::TestBug12519() { 4367 UErrorCode status = U_ZERO_ERROR; 4368 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status)); 4369 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status)); 4370 if (!assertSuccess(WHERE, status)) { 4371 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status)); 4372 return; 4373 } 4374 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status)); 4375 4376 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); 4377 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr); 4378 4379 LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone()); 4380 assertTrue(WHERE, *biEn == *cloneEn); 4381 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status)); 4382 4383 LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone()); 4384 assertTrue(WHERE, *biFr == *cloneFr); 4385 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status)); 4386 4387 
LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status)); 4388 UnicodeString text("Hallo Welt"); 4389 biDe->setText(text); 4390 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe); 4391 *biDe = *biFr; 4392 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe); 4393 } 4394 4395 void RBBITest::TestBug12677() { 4396 // Check that stripping of comments from rules for getRules() is not confused by 4397 // the presence of '#' characters in the rules that do not introduce comments. 4398 UnicodeString rules(u"!!forward; \n" 4399 "$x = [ab#]; # a set with a # literal. \n" 4400 " # .; # a comment that looks sort of like a rule. \n" 4401 " '#' '?'; # a rule with a quoted # \n" 4402 ); 4403 4404 UErrorCode status = U_ZERO_ERROR; 4405 UParseError pe; 4406 RuleBasedBreakIterator bi(rules, pe, status); 4407 assertSuccess(WHERE, status); 4408 UnicodeString rtRules = bi.getRules(); 4409 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules); 4410 } 4411 4412 4413 void RBBITest::TestTableRedundancies() { 4414 UErrorCode status = U_ZERO_ERROR; 4415 4416 LocalPointer<RuleBasedBreakIterator> bi ( 4417 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status)); 4418 assertSuccess(WHERE, status); 4419 if (U_FAILURE(status)) return; 4420 4421 RBBIDataWrapper *dw = bi->fData; 4422 const RBBIStateTable *fwtbl = dw->fForwardTable; 4423 int32_t numCharClasses = dw->fHeader->fCatCount; 4424 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); 4425 4426 // Check for duplicate columns (character categories) 4427 4428 std::vector<UnicodeString> columns; 4429 for (int32_t column = 0; column < numCharClasses; column++) { 4430 UnicodeString s; 4431 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) { 4432 RBBIStateTableRow 
*row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); 4433 s.append(row->fNextState[column]); 4434 } 4435 columns.push_back(s); 4436 } 4437 // Ignore column (char class) 0 while checking; it's special, and may have duplicates. 4438 for (int c1=1; c1<numCharClasses; c1++) { 4439 for (int c2 = c1+1; c2 < numCharClasses; c2++) { 4440 if (columns.at(c1) == columns.at(c2)) { 4441 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2); 4442 goto out; 4443 } 4444 } 4445 } 4446 out: 4447 4448 // Check for duplicate states 4449 std::vector<UnicodeString> rows; 4450 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { 4451 UnicodeString s; 4452 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); 4453 assertTrue(WHERE, row->fAccepting >= -1); 4454 s.append(row->fAccepting + 1); // values of -1 are expected. 4455 s.append(row->fLookAhead); 4456 s.append(row->fTagIdx); 4457 for (int32_t column = 0; column < numCharClasses; column++) { 4458 s.append(row->fNextState[column]); 4459 } 4460 rows.push_back(s); 4461 } 4462 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) { 4463 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) { 4464 if (rows.at(r1) == rows.at(r2)) { 4465 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2); 4466 return; 4467 } 4468 } 4469 } 4470 } 4471 4472 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(), 4473 // even after next() has returned DONE. 
4474 4475 void RBBITest::TestBug13447() { 4476 UErrorCode status = U_ZERO_ERROR; 4477 LocalPointer<RuleBasedBreakIterator> bi( 4478 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status)); 4479 assertSuccess(WHERE, status); 4480 if (U_FAILURE(status)) return; 4481 UnicodeString data(u"1234"); 4482 bi->setText(data); 4483 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus()); 4484 assertEquals(WHERE, 4, bi->next()); 4485 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); 4486 assertEquals(WHERE, UBRK_DONE, bi->next()); 4487 assertEquals(WHERE, 4, bi->current()); 4488 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); 4489 } 4490 4491 // TestReverse exercises both the synthesized safe reverse rules and the logic 4492 // for filling the break iterator cache when starting from random positions 4493 // in the text. 4494 // 4495 // It's a monkey test, working on random data, with the expected data obtained 4496 // from forward iteration (no safe rules involved), comparing with results 4497 // when indexing into the interior of the string (safe rules needed). 
4498 4499 void RBBITest::TestReverse() { 4500 UErrorCode status = U_ZERO_ERROR; 4501 4502 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) 4503 BreakIterator::createCharacterInstance(Locale::getEnglish(), status))); 4504 assertSuccess(WHERE, status, true); 4505 status = U_ZERO_ERROR; 4506 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) 4507 BreakIterator::createWordInstance(Locale::getEnglish(), status))); 4508 assertSuccess(WHERE, status, true); 4509 status = U_ZERO_ERROR; 4510 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) 4511 BreakIterator::createLineInstance(Locale::getEnglish(), status))); 4512 assertSuccess(WHERE, status, true); 4513 status = U_ZERO_ERROR; 4514 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) 4515 BreakIterator::createSentenceInstance(Locale::getEnglish(), status))); 4516 assertSuccess(WHERE, status, true); 4517 } 4518 4519 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) { 4520 if (!bi) { 4521 return; 4522 } 4523 4524 // From the mapping trie in the break iterator's internal data, create a 4525 // vector of UnicodeStrings, one for each character category, containing 4526 // all of the code points that map to that category. Unicode planes 0 and 1 only, 4527 // to avoid an execess of unassigned code points. 4528 4529 RBBIDataWrapper *data = bi->fData; 4530 int32_t categoryCount = data->fHeader->fCatCount; 4531 UTrie2 *trie = data->fTrie; 4532 4533 std::vector<UnicodeString> strings(categoryCount, UnicodeString()); 4534 for (int cp=0; cp<0x1fff0; ++cp) { 4535 int cat = utrie2_get32(trie, cp); 4536 cat &= ~0x4000; // And off the dictionary bit from the category. 
4537 assertTrue(WHERE, cat < categoryCount && cat >= 0); 4538 if (cat < 0 || cat >= categoryCount) return; 4539 strings[cat].append(cp); 4540 } 4541 4542 icu_rand randomGen; 4543 const int testStringLength = 10000; 4544 UnicodeString testString; 4545 4546 for (int i=0; i<testStringLength; ++i) { 4547 int charClass = randomGen() % categoryCount; 4548 if (strings[charClass].length() > 0) { 4549 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length()); 4550 testString.append(cp); 4551 } 4552 } 4553 4554 typedef std::pair<UBool, int32_t> Result; 4555 std::vector<Result> expectedResults; 4556 bi->setText(testString); 4557 for (int i=0; i<testString.length(); ++i) { 4558 bool isboundary = bi->isBoundary(i); 4559 int ruleStatus = bi->getRuleStatus(); 4560 expectedResults.push_back(std::make_pair(isboundary, ruleStatus)); 4561 } 4562 4563 for (int i=testString.length()-1; i>=0; --i) { 4564 bi->setText(testString); // clears the internal break cache 4565 Result expected = expectedResults[i]; 4566 assertEquals(WHERE, expected.first, bi->isBoundary(i)); 4567 assertEquals(WHERE, expected.second, bi->getRuleStatus()); 4568 } 4569 } 4570 4571 4572 // Ticket 13692 - finding word boundaries in very large numbers or words could 4573 // be very time consuming. When the problem was present, this void test 4574 // would run more than fifteen minutes, which is to say, the failure was noticeale. 
4575 4576 void RBBITest::TestBug13692() { 4577 UErrorCode status = U_ZERO_ERROR; 4578 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *) 4579 BreakIterator::createWordInstance(Locale::getEnglish(), status), status); 4580 if (!assertSuccess(WHERE, status, true)) { 4581 return; 4582 } 4583 constexpr int32_t LENGTH = 1000000; 4584 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH); 4585 for (int i=0; i<20; i+=2) { 4586 longNumber.setCharAt(i, u' '); 4587 } 4588 bi->setText(longNumber); 4589 assertFalse(WHERE, bi->isBoundary(LENGTH-5)); 4590 assertSuccess(WHERE, status); 4591 } 4592 4593 // 4594 // TestDebug - A place-holder test for debugging purposes. 4595 // For putting in fragments of other tests that can be invoked 4596 // for tracing without a lot of unwanted extra stuff happening. 4597 // 4598 void RBBITest::TestDebug(void) { 4599 UErrorCode status = U_ZERO_ERROR; 4600 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *) 4601 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status); 4602 if (!assertSuccess(WHERE, status, true)) { 4603 return; 4604 } 4605 const UnicodeString &rules = bi->getRules(); 4606 UParseError pe; 4607 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status)); 4608 assertSuccess(WHERE, status); 4609 } 4610 4611 void RBBITest::TestProperties() { 4612 UErrorCode errorCode = U_ZERO_ERROR; 4613 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4614 if (!prependSet.isEmpty()) { 4615 errln( 4616 "[:GCB=Prepend:] is not empty any more. " 4617 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4618 "change this test to the opposite condition."); 4619 } 4620 } 4621 4622 #endif // #if !UCONFIG_NO_BREAK_ITERATION 4623