1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/utypes.h" 17 #include "unicode/brkiter.h" 18 #include "unicode/rbbi.h" 19 #include "unicode/uchar.h" 20 #include "unicode/utf16.h" 21 #include "unicode/ucnv.h" 22 #include "unicode/schriter.h" 23 #include "unicode/uniset.h" 24 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 25 #include "unicode/ustring.h" 26 #include "unicode/utext.h" 27 #include "intltest.h" 28 #include "rbbitst.h" 29 #include <string.h> 30 #include "uvector.h" 31 #include "uvectr32.h" 32 #include "triedict.h" 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 37 #define TEST_ASSERT(x) {if (!(x)) { \ 38 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 39 40 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 41 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 42 43 44 //--------------------------------------------- 45 // runIndexedTest 46 //--------------------------------------------- 47 48 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 49 { 50 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 51 52 switch (index) { 53 #if !UCONFIG_NO_FILE_IO 54 case 0: name = "TestBug4153072"; 55 if(exec) TestBug4153072(); break; 56 #else 57 case 0: name = "skip"; 58 break; 59 #endif 60 61 case 1: name 
= "TestJapaneseLineBreak"; 62 if(exec) TestJapaneseLineBreak(); break; 63 case 2: name = "TestStatusReturn"; 64 if(exec) TestStatusReturn(); break; 65 66 #if !UCONFIG_NO_FILE_IO 67 case 3: name = "TestUnicodeFiles"; 68 if(exec) TestUnicodeFiles(); break; 69 case 4: name = "TestEmptyString"; 70 if(exec) TestEmptyString(); break; 71 #else 72 case 3: case 4: name = "skip"; 73 break; 74 #endif 75 76 case 5: name = "TestGetAvailableLocales"; 77 if(exec) TestGetAvailableLocales(); break; 78 79 case 6: name = "TestGetDisplayName"; 80 if(exec) TestGetDisplayName(); break; 81 82 #if !UCONFIG_NO_FILE_IO 83 case 7: name = "TestEndBehaviour"; 84 if(exec) TestEndBehaviour(); break; 85 case 8: name = "TestMixedThaiLineBreak"; 86 if(exec) TestMixedThaiLineBreak(); break; 87 case 9: name = "TestThaiLineBreak"; 88 if(exec) TestThaiLineBreak(); break; 89 case 10: name = "TestMaiyamok"; 90 if(exec) TestMaiyamok(); break; 91 case 11: name = "TestWordBreaks"; 92 if(exec) TestWordBreaks(); break; 93 case 12: name = "TestWordBoundary"; 94 if(exec) TestWordBoundary(); break; 95 case 13: name = "TestLineBreaks"; 96 if(exec) TestLineBreaks(); break; 97 case 14: name = "TestSentBreaks"; 98 if(exec) TestSentBreaks(); break; 99 case 15: name = "TestExtended"; 100 if(exec) TestExtended(); break; 101 #else 102 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 103 break; 104 #endif 105 106 case 16: 107 if(exec) { 108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 109 name = "TestMonkey"; 110 TestMonkey(params); 111 #else 112 name = "skip"; 113 #endif 114 } 115 break; 116 117 #if !UCONFIG_NO_FILE_IO 118 case 17: name = "TestBug3818"; 119 if(exec) TestBug3818(); break; 120 case 18: name = "TestJapaneseWordBreak"; 121 if(exec) TestJapaneseWordBreak(); break; 122 #else 123 case 17: case 18: name = "skip"; 124 break; 125 #endif 126 127 case 19: name = "TestDebug"; 128 if(exec) TestDebug(); break; 129 case 20: name = "TestTrieDict"; 130 if(exec) 
TestTrieDict(); break; 131 132 #if !UCONFIG_NO_FILE_IO 133 case 21: name = "TestBug5775"; 134 if (exec) TestBug5775(); break; 135 case 22: name = "TestThaiBreaks"; 136 if (exec) TestThaiBreaks(); break; 137 case 23: name = "TestTailoredBreaks"; 138 if (exec) TestTailoredBreaks(); break; 139 #else 140 case 21: case 22: case 23: name = "skip"; 141 break; 142 #endif 143 case 24: name = "TestDictRules"; 144 if (exec) TestDictRules(); break; 145 146 default: name = ""; break; //needed to end loop 147 } 148 } 149 150 151 //--------------------------------------------------------------------------- 152 // 153 // class BITestData Holds a set of Break iterator test data and results 154 // Includes 155 // - the string data to be broken 156 // - a vector of the expected break positions. 157 // - a vector of source line numbers for the data, 158 // (to help see where errors occured.) 159 // - The expected break tag values. 160 // - Vectors of actual break positions and tag values. 161 // - Functions for comparing actual with expected and 162 // reporting errors. 163 // 164 //---------------------------------------------------------------------------- 165 class BITestData { 166 public: 167 UnicodeString fDataToBreak; 168 UVector fExpectedBreakPositions; 169 UVector fExpectedTags; 170 UVector fLineNum; 171 UVector fActualBreakPositions; // Test Results. 172 UVector fActualTags; 173 174 BITestData(UErrorCode &status); 175 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 176 void checkResults(const char *heading, RBBITest *test); 177 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 178 void clearResults(); 179 }; 180 181 // 182 // Constructor. 183 // 184 BITestData::BITestData(UErrorCode &status) 185 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 186 fActualTags(status) 187 { 188 } 189 190 // 191 // addDataChunk. 
Add a section (non-breaking) piece if data to the test data. 192 // The macro form collects the line number, which is helpful 193 // when tracking down failures. 194 // 195 // A null data item is inserted at the start of each test's data 196 // to put the starting zero into the data list. The position saved for 197 // each non-null item is its ending position. 198 // 199 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 200 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 201 if (U_FAILURE(status)) {return;} 202 if (data != NULL) { 203 fDataToBreak.append(CharsToUnicodeString(data)); 204 } 205 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 206 fExpectedTags.addElement(tag, status); 207 fLineNum.addElement(lineNum, status); 208 } 209 210 211 // 212 // checkResults. Compare the actual and expected break positions, report any differences. 213 // 214 void BITestData::checkResults(const char *heading, RBBITest *test) { 215 int32_t expectedIndex = 0; 216 int32_t actualIndex = 0; 217 218 for (;;) { 219 // If we've run through both the expected and actual results vectors, we're done. 220 // break out of the loop. 221 if (expectedIndex >= fExpectedBreakPositions.size() && 222 actualIndex >= fActualBreakPositions.size()) { 223 break; 224 } 225 226 227 if (expectedIndex >= fExpectedBreakPositions.size()) { 228 err(heading, test, expectedIndex-1, actualIndex); 229 actualIndex++; 230 continue; 231 } 232 233 if (actualIndex >= fActualBreakPositions.size()) { 234 err(heading, test, expectedIndex, actualIndex-1); 235 expectedIndex++; 236 continue; 237 } 238 239 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 240 err(heading, test, expectedIndex, actualIndex); 241 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 
242 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 243 actualIndex++; 244 } else { 245 expectedIndex++; 246 } 247 continue; 248 } 249 250 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 251 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 252 heading, fLineNum.elementAt(expectedIndex), 253 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 254 } 255 256 actualIndex++; 257 expectedIndex++; 258 } 259 } 260 261 // 262 // err - An error was found. Report it, along with information about where the 263 // incorrectly broken test data appeared in the source file. 264 // 265 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 266 { 267 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 268 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 269 int32_t o = 0; 270 int32_t line = fLineNum.elementAti(expectedIdx); 271 if (expectedIdx > 0) { 272 // The line numbers are off by one because a premature break occurs somewhere 273 // within the previous item, rather than at the start of the current (expected) item. 274 // We want to report the offset of the unexpected break from the start of 275 // this previous item. 276 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 277 } 278 if (actual < expected) { 279 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 280 } else { 281 test->errln("%s Failed to find break at end of item from line %d. 
actual break: %d expected break: %d", heading, line, actual, expected); 282 } 283 } 284 285 286 void BITestData::clearResults() { 287 fActualBreakPositions.removeAllElements(); 288 fActualTags.removeAllElements(); 289 } 290 291 292 //----------------------------------------------------------------------------------- 293 // 294 // Cannned Test Characters 295 // 296 //----------------------------------------------------------------------------------- 297 298 static const UChar cannedTestArray[] = { 299 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 300 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 301 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 302 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 303 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 304 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 305 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 306 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 307 }; 308 309 static UnicodeString* cannedTestChars = 0; 310 311 #define halfNA "\\u0928\\u094d\\u200d" 312 #define halfSA "\\u0938\\u094d\\u200d" 313 #define halfCHA "\\u091a\\u094d\\u200d" 314 #define halfKA "\\u0915\\u094d\\u200d" 315 #define deadTA "\\u0924\\u094d" 316 317 //-------------------------------------------------------------------------------------- 318 // 319 // RBBITest constructor and destructor 320 // 321 //-------------------------------------------------------------------------------------- 322 323 RBBITest::RBBITest() { 324 UnicodeString 
temp(cannedTestArray); 325 cannedTestChars = new UnicodeString(); 326 *cannedTestChars += (UChar)0x0000; 327 *cannedTestChars += temp; 328 } 329 330 331 RBBITest::~RBBITest() { 332 delete cannedTestChars; 333 } 334 335 336 static const int T_NUMBER = 100; 337 static const int T_LETTER = 200; 338 static const int T_H_OR_K = 300; 339 static const int T_IDEO = 400; 340 341 342 343 344 345 346 //-------------------------------------------------------------------- 347 //Testing the BreakIterator for devanagari script 348 //-------------------------------------------------------------------- 349 350 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 351 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 352 #define deadTTHA "\\u0920\\u094d" 353 #define deadPA "\\u092a\\u094d" 354 #define deadSA "\\u0938\\u094d" 355 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 356 357 358 359 360 361 362 //----------------------------------------------------------------------------------- 363 // 364 // Test for status {tag} return value from break rules. 365 // TODO: a more thorough test. 
366 // 367 //----------------------------------------------------------------------------------- 368 void RBBITest::TestStatusReturn() { 369 UnicodeString rulesString1("$Letters = [:L:];\n" 370 "$Numbers = [:N:];\n" 371 "$Letters+{1};\n" 372 "$Numbers+{2};\n" 373 "Help\\ {4}/me\\!;\n" 374 "[^$Letters $Numbers];\n" 375 "!.*;\n", -1, US_INV); 376 UnicodeString testString1 = "abc123..abc Help me Help me!"; 377 // 01234567890123456789012345678 378 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 379 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 380 381 UErrorCode status=U_ZERO_ERROR; 382 UParseError parseError; 383 384 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 385 if(U_FAILURE(status)) { 386 dataerrln("FAIL : in construction - %s", u_errorName(status)); 387 } else { 388 int32_t pos; 389 int32_t i = 0; 390 bi->setText(testString1); 391 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 392 if (pos != bounds1[i]) { 393 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 394 break; 395 } 396 397 int tag = bi->getRuleStatus(); 398 if (tag != brkStatus[i]) { 399 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 400 break; 401 } 402 i++; 403 } 404 } 405 delete bi; 406 } 407 408 409 static void printStringBreaks(UnicodeString ustr, int expected[], 410 int expectedcount) 411 { 412 UErrorCode status = U_ZERO_ERROR; 413 char name[100]; 414 printf("code alpha extend alphanum type word sent line name\n"); 415 int j; 416 for (j = 0; j < ustr.length(); j ++) { 417 if (expectedcount > 0) { 418 int k; 419 for (k = 0; k < expectedcount; k ++) { 420 if (j == expected[k]) { 421 printf("------------------------------------------------ %d\n", 422 j); 423 } 424 } 425 } 426 UChar32 c = ustr.char32At(j); 427 if (c > 0xffff) { 428 j ++; 429 } 430 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 431 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 432 u_isUAlphabetic(c), 433 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 434 u_isalnum(c), 435 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 436 u_charType(c), 437 U_SHORT_PROPERTY_NAME), 438 u_getPropertyValueName(UCHAR_WORD_BREAK, 439 u_getIntPropertyValue(c, 440 UCHAR_WORD_BREAK), 441 U_SHORT_PROPERTY_NAME), 442 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 443 u_getIntPropertyValue(c, 444 UCHAR_SENTENCE_BREAK), 445 U_SHORT_PROPERTY_NAME), 446 u_getPropertyValueName(UCHAR_LINE_BREAK, 447 u_getIntPropertyValue(c, 448 UCHAR_LINE_BREAK), 449 U_SHORT_PROPERTY_NAME), 450 name); 451 } 452 } 453 454 void RBBITest::TestThaiLineBreak() { 455 UErrorCode status = U_ZERO_ERROR; 456 BITestData thaiLineSelection(status); 457 458 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 459 // represents elided letters at the end of a long word. It should be bound to 460 // the end of the word and not treated as an independent punctuation mark. 461 462 463 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 464 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 465 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 466 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 467 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 468 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 469 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 470 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 471 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 472 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 473 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 474 ADD_DATACHUNK(thaiLineSelection, 
"\\u0e40\\u0e23\\u0e48\\u0e07", 0, status); 475 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 476 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 477 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 478 479 // the one time where the paiyannoi occurs somewhere other than at the end 480 // of a word is in the Thai abbrevation for "etc.", which both begins and 481 // ends with a paiyannoi 482 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 483 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 484 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 485 486 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 487 Locale("th"), status); 488 if (U_FAILURE(status)) 489 { 490 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 491 return; 492 } 493 494 generalIteratorTest(*e, thaiLineSelection); 495 delete e; 496 } 497 498 499 500 void RBBITest::TestMixedThaiLineBreak() 501 { 502 UErrorCode status = U_ZERO_ERROR; 503 BITestData thaiLineSelection(status); 504 505 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 506 507 508 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 509 // start 510 511 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 512 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 513 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 514 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 515 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 516 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 517 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 518 
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status); 519 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 520 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 521 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 522 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 523 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 524 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 525 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 526 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 527 528 // @suwit - end of changes 529 530 531 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 532 if (U_FAILURE(status)) 533 { 534 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 535 return; 536 } 537 538 539 generalIteratorTest(*e, thaiLineSelection); 540 delete e; 541 } 542 543 544 void RBBITest::TestMaiyamok() 545 { 546 UErrorCode status = U_ZERO_ERROR; 547 BITestData thaiLineSelection(status); 548 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 549 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 550 // word". 
Instead of appearing as a word unto itself, however, it's kept together 551 // with the word before it 552 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 553 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 554 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 555 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 556 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 557 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 558 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 559 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 560 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 561 562 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 563 Locale("th"), status); 564 565 if (U_FAILURE(status)) 566 { 567 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 568 return; 569 } 570 generalIteratorTest(*e, thaiLineSelection); 571 delete e; 572 } 573 574 575 576 void RBBITest::TestBug3818() { 577 UErrorCode status = U_ZERO_ERROR; 578 579 // Four Thai words... 
580 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 581 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 582 UnicodeString thaiStr(thaiWordData); 583 584 RuleBasedBreakIterator* bi = 585 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 586 if (U_FAILURE(status) || bi == NULL) { 587 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 588 return; 589 } 590 bi->setText(thaiStr); 591 592 int32_t startOfSecondWord = bi->following(1); 593 if (startOfSecondWord != 4) { 594 errln("Fail at file %s, line %d expected start of word at 4, got %d", 595 __FILE__, __LINE__, startOfSecondWord); 596 } 597 startOfSecondWord = bi->following(0); 598 if (startOfSecondWord != 4) { 599 errln("Fail at file %s, line %d expected start of word at 4, got %d", 600 __FILE__, __LINE__, startOfSecondWord); 601 } 602 delete bi; 603 } 604 605 606 void RBBITest::TestJapaneseWordBreak() { 607 UErrorCode status = U_ZERO_ERROR; 608 BITestData japaneseWordSelection(status); 609 610 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 611 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 612 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 613 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 614 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 615 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 616 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 617 618 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 619 Locale("ja"), status); 620 if (U_FAILURE(status)) 621 { 622 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 623 return; 624 } 625 626 generalIteratorTest(*e, japaneseWordSelection); 627 
delete e; 628 } 629 630 void RBBITest::TestTrieDict() { 631 UErrorCode status = U_ZERO_ERROR; 632 633 // 634 // Open and read the test data file. 635 // 636 const char *testDataDirectory = IntlTest::getSourceTestData(status); 637 char testFileName[1000]; 638 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 639 errln("Can't open test data. Path too long."); 640 return; 641 } 642 strcpy(testFileName, testDataDirectory); 643 strcat(testFileName, "riwords.txt"); 644 645 // Items needing deleting at the end 646 MutableTrieDictionary *mutableDict = NULL; 647 CompactTrieDictionary *compactDict = NULL; 648 UnicodeSet *breaks = NULL; 649 UChar *testFile = NULL; 650 StringEnumeration *enumer1 = NULL; 651 StringEnumeration *enumer2 = NULL; 652 MutableTrieDictionary *mutable2 = NULL; 653 StringEnumeration *cloneEnum = NULL; 654 CompactTrieDictionary *compact2 = NULL; 655 656 657 const UnicodeString *originalWord = NULL; 658 const UnicodeString *cloneWord = NULL; 659 UChar *current; 660 UChar *word; 661 UChar uc; 662 int32_t wordLen; 663 int32_t wordCount; 664 int32_t testCount; 665 666 int len; 667 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 668 if (U_FAILURE(status)) { 669 goto cleanup; /* something went wrong, error already output */ 670 } 671 672 mutableDict = new MutableTrieDictionary(0x0E1C, status); 673 if (U_FAILURE(status)) { 674 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 675 goto cleanup; 676 } 677 678 breaks = new UnicodeSet; 679 breaks->add(0x000A); // Line Feed 680 breaks->add(0x000D); // Carriage Return 681 breaks->add(0x2028); // Line Separator 682 breaks->add(0x2029); // Paragraph Separator 683 684 // Now add each non-comment line of the file as a word. 
685 current = testFile; 686 word = current; 687 uc = *current++; 688 wordLen = 0; 689 wordCount = 0; 690 691 while (uc) { 692 if (uc == 0x0023) { // #comment line, skip 693 while (uc && !breaks->contains(uc)) { 694 uc = *current++; 695 } 696 } 697 else while (uc && !breaks->contains(uc)) { 698 ++wordLen; 699 uc = *current++; 700 } 701 if (wordLen > 0) { 702 mutableDict->addWord(word, wordLen, status); 703 if (U_FAILURE(status)) { 704 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 705 goto cleanup; 706 } 707 wordCount += 1; 708 } 709 710 // Find beginning of next line 711 while (uc && breaks->contains(uc)) { 712 uc = *current++; 713 } 714 word = current-1; 715 wordLen = 0; 716 } 717 718 if (wordCount < 50) { 719 errln("Word count (%d) unreasonably small\n", wordCount); 720 goto cleanup; 721 } 722 723 enumer1 = mutableDict->openWords(status); 724 if (U_FAILURE(status)) { 725 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 726 goto cleanup; 727 } 728 729 testCount = 0; 730 if (wordCount != (testCount = enumer1->count(status))) { 731 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 732 testCount, wordCount, u_errorName(status)); 733 goto cleanup; 734 } 735 736 // Now compact it 737 compactDict = new CompactTrieDictionary(*mutableDict, status); 738 if (U_FAILURE(status)) { 739 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 740 goto cleanup; 741 } 742 743 enumer2 = compactDict->openWords(status); 744 if (U_FAILURE(status)) { 745 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 746 goto cleanup; 747 } 748 749 if (wordCount != (testCount = enumer2->count(status))) { 750 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 751 testCount, wordCount, u_errorName(status)); 752 goto cleanup; 753 } 754 755 if (enumer1->getDynamicClassID() == 
enumer2->getDynamicClassID()) { 756 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 757 } 758 delete enumer1; 759 enumer1 = NULL; 760 delete enumer2; 761 enumer2 = NULL; 762 763 // Now un-compact it 764 mutable2 = compactDict->cloneMutable(status); 765 if (U_FAILURE(status)) { 766 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 767 goto cleanup; 768 } 769 770 cloneEnum = mutable2->openWords(status); 771 if (U_FAILURE(status)) { 772 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 773 goto cleanup; 774 } 775 776 if (wordCount != (testCount = cloneEnum->count(status))) { 777 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 778 testCount, wordCount, u_errorName(status)); 779 goto cleanup; 780 } 781 782 // Compact original dictionary to clone. Note that we can only compare the same kind of 783 // dictionary as the order of the enumerators is not guaranteed to be the same between 784 // different kinds 785 enumer1 = mutableDict->openWords(status); 786 if (U_FAILURE(status)) { 787 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 788 goto cleanup; 789 } 790 791 originalWord = enumer1->snext(status); 792 cloneWord = cloneEnum->snext(status); 793 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 794 if (*originalWord != *cloneWord) { 795 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 796 goto cleanup; 797 } 798 originalWord = enumer1->snext(status); 799 cloneWord = cloneEnum->snext(status); 800 } 801 802 if (U_FAILURE(status)) { 803 errln("Enumeration failed: %s\n", u_errorName(status)); 804 goto cleanup; 805 } 806 807 if (originalWord != cloneWord) { 808 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 809 goto cleanup; 810 } 811 812 // Test the data copying constructor for 
CompactTrieDict, and the data access APIs. 813 compact2 = new CompactTrieDictionary(compactDict->data(), status); 814 if (U_FAILURE(status)) { 815 errln("CompactTrieDictionary(const void *,...) failed\n"); 816 goto cleanup; 817 } 818 819 if (compact2->dataSize() == 0) { 820 errln("CompactTrieDictionary->dataSize() == 0\n"); 821 goto cleanup; 822 } 823 824 // Now count the words via the second dictionary 825 delete enumer1; 826 enumer1 = compact2->openWords(status); 827 if (U_FAILURE(status)) { 828 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 829 goto cleanup; 830 } 831 832 if (wordCount != (testCount = enumer1->count(status))) { 833 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 834 testCount, wordCount, u_errorName(status)); 835 goto cleanup; 836 } 837 838 cleanup: 839 delete compactDict; 840 delete mutableDict; 841 delete breaks; 842 delete[] testFile; 843 delete enumer1; 844 delete mutable2; 845 delete cloneEnum; 846 delete compact2; 847 } 848 849 850 //---------------------------------------------------------------------------- 851 // 852 // generalIteratorTest Given a break iterator and a set of test data, 853 // Run the tests and report the results. 854 // 855 //---------------------------------------------------------------------------- 856 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 857 { 858 859 bi.setText(td.fDataToBreak); 860 861 testFirstAndNext(bi, td); 862 863 testLastAndPrevious(bi, td); 864 865 testFollowing(bi, td); 866 testPreceding(bi, td); 867 testIsBoundary(bi, td); 868 doMultipleSelectionTest(bi, td); 869 } 870 871 872 // 873 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 874 // kind of loop. 
875 // 876 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 877 { 878 UErrorCode status = U_ZERO_ERROR; 879 int32_t p; 880 int32_t lastP = -1; 881 int32_t tag; 882 883 logln("Test first and next"); 884 bi.setText(td.fDataToBreak); 885 td.clearResults(); 886 887 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 888 td.fActualBreakPositions.addElement(p, status); // Save result. 889 tag = bi.getRuleStatus(); 890 td.fActualTags.addElement(tag, status); 891 if (p <= lastP) { 892 // If the iterator is not making forward progress, stop. 893 // No need to raise an error here, it'll be detected in the normal check of results. 894 break; 895 } 896 lastP = p; 897 } 898 td.checkResults("testFirstAndNext", this); 899 } 900 901 902 // 903 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 904 // 905 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 906 { 907 UErrorCode status = U_ZERO_ERROR; 908 int32_t p; 909 int32_t lastP = 0x7ffffffe; 910 int32_t tag; 911 912 logln("Test last and previous"); 913 bi.setText(td.fDataToBreak); 914 td.clearResults(); 915 916 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 917 // Save break position. Insert it at start of vector of results, shoving 918 // already-saved results further towards the end. 919 td.fActualBreakPositions.insertElementAt(p, 0, status); 920 // bi.previous(); // TODO: Why does this fix things up???? 921 // bi.next(); 922 tag = bi.getRuleStatus(); 923 td.fActualTags.insertElementAt(tag, 0, status); 924 if (p >= lastP) { 925 // If the iterator is not making progress, stop. 926 // No need to raise an error here, it'll be detected in the normal check of results. 
927 break; 928 } 929 lastP = p; 930 } 931 td.checkResults("testLastAndPrevious", this); 932 } 933 934 935 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 936 { 937 UErrorCode status = U_ZERO_ERROR; 938 int32_t p; 939 int32_t tag; 940 int32_t lastP = -2; // A value that will never be returned as a break position. 941 // cannot be -1; that is returned for DONE. 942 int i; 943 944 logln("testFollowing():"); 945 bi.setText(td.fDataToBreak); 946 td.clearResults(); 947 948 // Save the starting point, since we won't get that out of following. 949 p = bi.first(); 950 td.fActualBreakPositions.addElement(p, status); // Save result. 951 tag = bi.getRuleStatus(); 952 td.fActualTags.addElement(tag, status); 953 954 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 955 p = bi.following(i); 956 if (p != lastP) { 957 if (p == RuleBasedBreakIterator::DONE) { 958 break; 959 } 960 // We've reached a new break position. Save it. 961 td.fActualBreakPositions.addElement(p, status); // Save result. 962 tag = bi.getRuleStatus(); 963 td.fActualTags.addElement(tag, status); 964 lastP = p; 965 } 966 } 967 // The loop normally exits by means of the break in the middle. 968 // Make sure that the index was at the correct position for the break iterator to have 969 // returned DONE. 970 if (i != td.fDataToBreak.length()) { 971 errln("testFollowing(): iterator returned DONE prematurely."); 972 } 973 974 // Full check of all results. 
975 td.checkResults("testFollowing", this); 976 } 977 978 979 980 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 981 UErrorCode status = U_ZERO_ERROR; 982 int32_t p; 983 int32_t tag; 984 int32_t lastP = 0x7ffffffe; 985 int i; 986 987 logln("testPreceding():"); 988 bi.setText(td.fDataToBreak); 989 td.clearResults(); 990 991 p = bi.last(); 992 td.fActualBreakPositions.addElement(p, status); 993 tag = bi.getRuleStatus(); 994 td.fActualTags.addElement(tag, status); 995 996 for (i = td.fDataToBreak.length(); i>=-1; i--) { 997 p = bi.preceding(i); 998 if (p != lastP) { 999 if (p == RuleBasedBreakIterator::DONE) { 1000 break; 1001 } 1002 // We've reached a new break position. Save it. 1003 td.fActualBreakPositions.insertElementAt(p, 0, status); 1004 lastP = p; 1005 tag = bi.getRuleStatus(); 1006 td.fActualTags.insertElementAt(tag, 0, status); 1007 } 1008 } 1009 // The loop normally exits by means of the break in the middle. 1010 // Make sure that the index was at the correct position for the break iterator to have 1011 // returned DONE. 1012 if (i != 0) { 1013 errln("testPreceding(): iterator returned DONE prematurely."); 1014 } 1015 1016 // Full check of all results. 1017 td.checkResults("testPreceding", this); 1018 } 1019 1020 1021 1022 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 1023 UErrorCode status = U_ZERO_ERROR; 1024 int i; 1025 int32_t tag; 1026 1027 logln("testIsBoundary():"); 1028 bi.setText(td.fDataToBreak); 1029 td.clearResults(); 1030 1031 for (i = 0; i <= td.fDataToBreak.length(); i++) { 1032 if (bi.isBoundary(i)) { 1033 td.fActualBreakPositions.addElement(i, status); // Save result. 
1034 tag = bi.getRuleStatus(); 1035 td.fActualTags.addElement(tag, status); 1036 } 1037 } 1038 td.checkResults("testIsBoundary: ", this); 1039 } 1040 1041 1042 1043 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1044 { 1045 iterator.setText(td.fDataToBreak); 1046 1047 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1048 int32_t offset = iterator.first(); 1049 int32_t testOffset; 1050 int32_t count = 0; 1051 1052 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1053 1054 if (*testIterator != iterator) 1055 errln("clone() or operator!= failed: two clones compared unequal"); 1056 1057 do { 1058 testOffset = testIterator->first(); 1059 testOffset = testIterator->next(count); 1060 if (offset != testOffset) 1061 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1062 1063 if (offset != RuleBasedBreakIterator::DONE) { 1064 count++; 1065 offset = iterator.next(); 1066 1067 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1068 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1069 if (count > 10000 || offset == -1) { 1070 errln("operator== failed too many times. Stopping test."); 1071 if (offset == -1) { 1072 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1073 } 1074 return; 1075 } 1076 } 1077 } 1078 } while (offset != RuleBasedBreakIterator::DONE); 1079 1080 // now do it backwards... 
1081 offset = iterator.last(); 1082 count = 0; 1083 1084 do { 1085 testOffset = testIterator->last(); 1086 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1087 if (offset != testOffset) 1088 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1089 1090 if (offset != RuleBasedBreakIterator::DONE) { 1091 count--; 1092 offset = iterator.previous(); 1093 } 1094 } while (offset != RuleBasedBreakIterator::DONE); 1095 1096 delete testIterator; 1097 } 1098 1099 1100 //--------------------------------------------- 1101 // 1102 // other tests 1103 // 1104 //--------------------------------------------- 1105 void RBBITest::TestEmptyString() 1106 { 1107 UnicodeString text = ""; 1108 UErrorCode status = U_ZERO_ERROR; 1109 1110 BITestData x(status); 1111 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1112 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1113 if (U_FAILURE(status)) 1114 { 1115 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1116 return; 1117 } 1118 generalIteratorTest(*bi, x); 1119 delete bi; 1120 } 1121 1122 void RBBITest::TestGetAvailableLocales() 1123 { 1124 int32_t locCount = 0; 1125 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1126 1127 if (locCount == 0) 1128 dataerrln("getAvailableLocales() returned an empty list!"); 1129 // Just make sure that it's returning good memory. 
1130 int32_t i; 1131 for (i = 0; i < locCount; ++i) { 1132 logln(locList[i].getName()); 1133 } 1134 } 1135 1136 //Testing the BreakIterator::getDisplayName() function 1137 void RBBITest::TestGetDisplayName() 1138 { 1139 UnicodeString result; 1140 1141 BreakIterator::getDisplayName(Locale::getUS(), result); 1142 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1143 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1144 + result); 1145 1146 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1147 if (result != "French (France)") 1148 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1149 + result); 1150 } 1151 /** 1152 * Test End Behaviour 1153 * @bug 4068137 1154 */ 1155 void RBBITest::TestEndBehaviour() 1156 { 1157 UErrorCode status = U_ZERO_ERROR; 1158 UnicodeString testString("boo."); 1159 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1160 if (U_FAILURE(status)) 1161 { 1162 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1163 return; 1164 } 1165 wb->setText(testString); 1166 1167 if (wb->first() != 0) 1168 errln("Didn't get break at beginning of string."); 1169 if (wb->next() != 3) 1170 errln("Didn't get break before period in \"boo.\""); 1171 if (wb->current() != 4 && wb->next() != 4) 1172 errln("Didn't get break at end of string."); 1173 delete wb; 1174 } 1175 /* 1176 * @bug 4153072 1177 */ 1178 void RBBITest::TestBug4153072() { 1179 UErrorCode status = U_ZERO_ERROR; 1180 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1181 if (U_FAILURE(status)) 1182 { 1183 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1184 return; 1185 } 1186 UnicodeString str("...Hello, World!..."); 1187 int32_t begin = 3; 1188 int32_t end = str.length() - 3; 1189 UBool onBoundary; 1190 1191 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1192 iter->adoptText(textIterator); 1193 int index; 1194 // Note: with the switch to UText, there is no way to restrict the 1195 // iteration range to begin at an index other than zero. 1196 // String character iterators created with a non-zero bound are 1197 // treated by RBBI as being empty. 1198 for (index = -1; index < begin + 1; ++index) { 1199 onBoundary = iter->isBoundary(index); 1200 if (index == 0? !onBoundary : onBoundary) { 1201 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1202 " and begin index = " + begin); 1203 } 1204 } 1205 delete iter; 1206 } 1207 1208 1209 // 1210 // Test for problem reported by Ashok Matoria on 9 July 2007 1211 // One.<kSoftHyphen><kSpace>Two. 1212 // 1213 // Sentence break at start (0) and then on calling next() it breaks at 1214 // 'T' of "Two". Now, at this point if I do next() and 1215 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1216 // 1217 void RBBITest::TestBug5775() { 1218 UErrorCode status = U_ZERO_ERROR; 1219 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1220 TEST_ASSERT_SUCCESS(status); 1221 if (U_FAILURE(status)) { 1222 return; 1223 } 1224 // Check for status first for better handling of no data errors. 1225 TEST_ASSERT(bi != NULL); 1226 if (bi == NULL) { 1227 return; 1228 } 1229 1230 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1231 // 01234 56789 1232 s = s.unescape(); 1233 bi->setText(s); 1234 int pos = bi->next(); 1235 TEST_ASSERT(pos == 6); 1236 pos = bi->next(); 1237 TEST_ASSERT(pos == 10); 1238 pos = bi->previous(); 1239 TEST_ASSERT(pos == 6); 1240 delete bi; 1241 } 1242 1243 1244 1245 /** 1246 * Test Japanese Line Break 1247 * @bug 4095322 1248 */ 1249 void RBBITest::TestJapaneseLineBreak() 1250 { 1251 #if 0 1252 // Test needs updating some more... Dump it for now. 1253 1254 1255 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1256 // as opening and closing punctuation for line breaking. 1257 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1258 // from these tests. 
6-13-2002 1259 // 1260 UErrorCode status = U_ZERO_ERROR; 1261 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1262 UnicodeString precedingChars = CharsToUnicodeString( 1263 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1264 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1265 UnicodeString followingChars = CharsToUnicodeString( 1266 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1267 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1268 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1269 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1270 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1271 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1272 1273 int32_t i; 1274 if (U_FAILURE(status)) 1275 { 1276 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1277 return; 1278 } 1279 1280 for (i = 0; i < precedingChars.length(); i++) { 1281 testString.setCharAt(1, precedingChars[i]); 1282 iter->setText(testString); 1283 int32_t j = iter->first(); 1284 if (j != 0) 1285 errln("ja line break failure: failed to start at 0"); 1286 j = iter->next(); 1287 if (j != 1) 1288 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1289 + "' (" + ((int)(precedingChars[i])) + ")"); 1290 j = iter->next(); 1291 if (j != 3) 1292 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1293 + "' (" + ((int)(precedingChars[i])) + ")"); 1294 } 1295 1296 for (i = 0; i < followingChars.length(); i++) { 1297 testString.setCharAt(1, followingChars[i]); 1298 iter->setText(testString); 1299 int j = iter->first(); 1300 if (j != 0) 1301 errln("ja line break failure: failed to start at 0"); 1302 j = iter->next(); 1303 
if (j != 2) 1304 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1305 + "' (" + ((int)(followingChars[i])) + ")"); 1306 j = iter->next(); 1307 if (j != 3) 1308 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1309 + "' (" + ((int)(followingChars[i])) + ")"); 1310 } 1311 delete iter; 1312 #endif 1313 } 1314 1315 1316 //------------------------------------------------------------------------------ 1317 // 1318 // RBBITest::Extended Run RBBI Tests from an external test data file 1319 // 1320 //------------------------------------------------------------------------------ 1321 1322 struct TestParams { 1323 BreakIterator *bi; 1324 UnicodeString dataToBreak; 1325 UVector32 *expectedBreaks; 1326 UVector32 *srcLine; 1327 UVector32 *srcCol; 1328 }; 1329 1330 void RBBITest::executeTest(TestParams *t) { 1331 int32_t bp; 1332 int32_t prevBP; 1333 int32_t i; 1334 1335 if (t->bi == NULL) { 1336 return; 1337 } 1338 1339 t->bi->setText(t->dataToBreak); 1340 // 1341 // Run the iterator forward 1342 // 1343 prevBP = -1; 1344 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1345 if (prevBP == bp) { 1346 // Fail for lack of forward progress. 1347 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1348 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1349 break; 1350 } 1351 1352 // Check that there were we didn't miss an expected break between the last one 1353 // and this one. 1354 for (i=prevBP+1; i<bp; i++) { 1355 if (t->expectedBreaks->elementAti(i) != 0) { 1356 int expected[] = {0, i}; 1357 printStringBreaks(t->dataToBreak, expected, 2); 1358 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1359 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1360 } 1361 } 1362 1363 // Check that the break we did find was expected 1364 if (t->expectedBreaks->elementAti(bp) == 0) { 1365 int expected[] = {0, bp}; 1366 printStringBreaks(t->dataToBreak, expected, 2); 1367 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1368 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1369 } else { 1370 // The break was expected. 1371 // Check that the {nnn} tag value is correct. 1372 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1373 if (expectedTagVal == -1) { 1374 expectedTagVal = 0; 1375 } 1376 int32_t line = t->srcLine->elementAti(bp); 1377 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1378 if (rs != expectedTagVal) { 1379 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1380 " Actual, Expected status = %4d, %4d", 1381 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1382 } 1383 } 1384 1385 1386 prevBP = bp; 1387 } 1388 1389 // Verify that there were no missed expected breaks after the last one found 1390 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1391 if (t->expectedBreaks->elementAti(i) != 0) { 1392 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1393 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1394 } 1395 } 1396 1397 // 1398 // Run the iterator backwards, verify that the same breaks are found. 1399 // 1400 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1401 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1402 if (prevBP == bp) { 1403 // Fail for lack of progress. 1404 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1405 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1406 break; 1407 } 1408 1409 // Check that there were we didn't miss an expected break between the last one 1410 // and this one. (UVector returns zeros for index out of bounds.) 1411 for (i=prevBP-1; i>bp; i--) { 1412 if (t->expectedBreaks->elementAti(i) != 0) { 1413 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1414 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1415 } 1416 } 1417 1418 // Check that the break we did find was expected 1419 if (t->expectedBreaks->elementAti(bp) == 0) { 1420 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1421 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1422 } else { 1423 // The break was expected. 1424 // Check that the {nnn} tag value is correct. 1425 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1426 if (expectedTagVal == -1) { 1427 expectedTagVal = 0; 1428 } 1429 int line = t->srcLine->elementAti(bp); 1430 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1431 if (rs != expectedTagVal) { 1432 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1433 " Actual, Expected status = %4d, %4d", 1434 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1435 } 1436 } 1437 1438 prevBP = bp; 1439 } 1440 1441 // Verify that there were no missed breaks prior to the last one found 1442 for (i=prevBP-1; i>=0; i--) { 1443 if (t->expectedBreaks->elementAti(i) != 0) { 1444 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1445 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1446 } 1447 } 1448 } 1449 1450 1451 void RBBITest::TestExtended() { 1452 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1453 UErrorCode status = U_ZERO_ERROR; 1454 Locale locale(""); 1455 1456 UnicodeString rules; 1457 TestParams tp; 1458 tp.bi = NULL; 1459 tp.expectedBreaks = new UVector32(status); 1460 tp.srcLine = new UVector32(status); 1461 tp.srcCol = new UVector32(status); 1462 1463 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1464 if (U_FAILURE(status)) { 1465 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1466 } 1467 1468 1469 // 1470 // Open and read the test data file. 1471 // 1472 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1473 char testFileName[1000]; 1474 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1475 errln("Can't open test data. 
Path too long."); 1476 return; 1477 } 1478 strcpy(testFileName, testDataDirectory); 1479 strcat(testFileName, "rbbitst.txt"); 1480 1481 int len; 1482 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1483 if (U_FAILURE(status)) { 1484 return; /* something went wrong, error already output */ 1485 } 1486 1487 1488 1489 1490 // 1491 // Put the test data into a UnicodeString 1492 // 1493 UnicodeString testString(FALSE, testFile, len); 1494 1495 enum EParseState{ 1496 PARSE_COMMENT, 1497 PARSE_TAG, 1498 PARSE_DATA, 1499 PARSE_NUM 1500 } 1501 parseState = PARSE_TAG; 1502 1503 EParseState savedState = PARSE_TAG; 1504 1505 static const UChar CH_LF = 0x0a; 1506 static const UChar CH_CR = 0x0d; 1507 static const UChar CH_HASH = 0x23; 1508 /*static const UChar CH_PERIOD = 0x2e;*/ 1509 static const UChar CH_LT = 0x3c; 1510 static const UChar CH_GT = 0x3e; 1511 static const UChar CH_BACKSLASH = 0x5c; 1512 static const UChar CH_BULLET = 0x2022; 1513 1514 int32_t lineNum = 1; 1515 int32_t colStart = 0; 1516 int32_t column = 0; 1517 int32_t charIdx = 0; 1518 1519 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1520 1521 for (charIdx = 0; charIdx < len; ) { 1522 status = U_ZERO_ERROR; 1523 UChar c = testString.charAt(charIdx); 1524 charIdx++; 1525 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1526 // treat CRLF as a unit 1527 c = CH_LF; 1528 charIdx++; 1529 } 1530 if (c == CH_LF || c == CH_CR) { 1531 lineNum++; 1532 colStart = charIdx; 1533 } 1534 column = charIdx - colStart + 1; 1535 1536 switch (parseState) { 1537 case PARSE_COMMENT: 1538 if (c == 0x0a || c == 0x0d) { 1539 parseState = savedState; 1540 } 1541 break; 1542 1543 case PARSE_TAG: 1544 { 1545 if (c == CH_HASH) { 1546 parseState = PARSE_COMMENT; 1547 savedState = PARSE_TAG; 1548 break; 1549 } 1550 if (u_isUWhiteSpace(c)) { 1551 break; 1552 } 1553 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1554 delete tp.bi; 1555 tp.bi = BreakIterator::createWordInstance(locale, status); 1556 charIdx += 5; 1557 break; 1558 } 1559 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1560 delete tp.bi; 1561 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1562 charIdx += 5; 1563 break; 1564 } 1565 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1566 delete tp.bi; 1567 tp.bi = BreakIterator::createLineInstance(locale, status); 1568 charIdx += 5; 1569 break; 1570 } 1571 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1572 delete tp.bi; 1573 tp.bi = NULL; 1574 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1575 charIdx += 5; 1576 break; 1577 } 1578 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1579 delete tp.bi; 1580 tp.bi = BreakIterator::createTitleInstance(locale, status); 1581 charIdx += 6; 1582 break; 1583 } 1584 1585 // <locale loc_name> 1586 localeMatcher.reset(testString); 1587 if (localeMatcher.lookingAt(charIdx-1, status)) { 1588 UnicodeString localeName = localeMatcher.group(1, status); 1589 char localeName8[100]; 1590 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1591 locale = 
Locale::createFromName(localeName8); 1592 charIdx += localeMatcher.group(0, status).length(); 1593 TEST_ASSERT_SUCCESS(status); 1594 break; 1595 } 1596 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1597 parseState = PARSE_DATA; 1598 charIdx += 5; 1599 tp.dataToBreak = ""; 1600 tp.expectedBreaks->removeAllElements(); 1601 tp.srcCol ->removeAllElements(); 1602 tp.srcLine->removeAllElements(); 1603 break; 1604 } 1605 1606 errln("line %d: Tag expected in test file.", lineNum); 1607 parseState = PARSE_COMMENT; 1608 savedState = PARSE_DATA; 1609 goto end_test; // Stop the test. 1610 } 1611 break; 1612 1613 case PARSE_DATA: 1614 if (c == CH_BULLET) { 1615 int32_t breakIdx = tp.dataToBreak.length(); 1616 tp.expectedBreaks->setSize(breakIdx+1); 1617 tp.expectedBreaks->setElementAt(-1, breakIdx); 1618 tp.srcLine->setSize(breakIdx+1); 1619 tp.srcLine->setElementAt(lineNum, breakIdx); 1620 tp.srcCol ->setSize(breakIdx+1); 1621 tp.srcCol ->setElementAt(column, breakIdx); 1622 break; 1623 } 1624 1625 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1626 // Add final entry to mappings from break location to source file position. 1627 // Need one extra because last break position returned is after the 1628 // last char in the data, not at the last char. 1629 tp.srcLine->addElement(lineNum, status); 1630 tp.srcCol ->addElement(column, status); 1631 1632 parseState = PARSE_TAG; 1633 charIdx += 6; 1634 1635 // RUN THE TEST! 1636 executeTest(&tp); 1637 break; 1638 } 1639 1640 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1641 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1642 // Get the code point from the name and insert it into the test data. 1643 // (Damn, no API takes names in Unicode !!! 
1644 // we've got to take it back to char *) 1645 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1646 int32_t nameLength = nameEndIdx - (charIdx+2); 1647 char charNameBuf[200]; 1648 UChar32 theChar = -1; 1649 if (nameEndIdx != -1) { 1650 UErrorCode status = U_ZERO_ERROR; 1651 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1652 charNameBuf[sizeof(charNameBuf)-1] = 0; 1653 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1654 if (U_FAILURE(status)) { 1655 theChar = -1; 1656 } 1657 } 1658 if (theChar == -1) { 1659 errln("Error in named character in test file at line %d, col %d", 1660 lineNum, column); 1661 } else { 1662 // Named code point was recognized. Insert it 1663 // into the test data. 1664 tp.dataToBreak.append(theChar); 1665 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1666 tp.srcLine->addElement(lineNum, status); 1667 tp.srcCol ->addElement(column, status); 1668 } 1669 } 1670 if (nameEndIdx > charIdx) { 1671 charIdx = nameEndIdx+1; 1672 1673 } 1674 break; 1675 } 1676 1677 1678 1679 1680 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1681 charIdx++; 1682 int32_t breakIdx = tp.dataToBreak.length(); 1683 tp.expectedBreaks->setSize(breakIdx+1); 1684 tp.expectedBreaks->setElementAt(-1, breakIdx); 1685 tp.srcLine->setSize(breakIdx+1); 1686 tp.srcLine->setElementAt(lineNum, breakIdx); 1687 tp.srcCol ->setSize(breakIdx+1); 1688 tp.srcCol ->setElementAt(column, breakIdx); 1689 break; 1690 } 1691 1692 if (c == CH_LT) { 1693 tagValue = 0; 1694 parseState = PARSE_NUM; 1695 break; 1696 } 1697 1698 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1699 parseState = PARSE_COMMENT; 1700 savedState = PARSE_DATA; 1701 break; 1702 } 1703 1704 if (c == CH_BACKSLASH) { 1705 // Check for \ at end of line, a line continuation. 
1706 // Advance over (discard) the newline 1707 UChar32 cp = testString.char32At(charIdx); 1708 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1709 // We have a CR LF 1710 // Need an extra increment of the input ptr to move over both of them 1711 charIdx++; 1712 } 1713 if (cp == CH_LF || cp == CH_CR) { 1714 lineNum++; 1715 colStart = charIdx; 1716 charIdx++; 1717 break; 1718 } 1719 1720 // Let unescape handle the back slash. 1721 cp = testString.unescapeAt(charIdx); 1722 if (cp != -1) { 1723 // Escape sequence was recognized. Insert the char 1724 // into the test data. 1725 tp.dataToBreak.append(cp); 1726 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1727 tp.srcLine->addElement(lineNum, status); 1728 tp.srcCol ->addElement(column, status); 1729 } 1730 break; 1731 } 1732 1733 1734 // Not a recognized backslash escape sequence. 1735 // Take the next char as a literal. 1736 // TODO: Should this be an error? 1737 c = testString.charAt(charIdx); 1738 charIdx = testString.moveIndex32(charIdx, 1); 1739 } 1740 1741 // Normal, non-escaped data char. 1742 tp.dataToBreak.append(c); 1743 1744 // Save the mapping from offset in the data to line/column numbers in 1745 // the original input file. Will be used for better error messages only. 1746 // If there's an expected break before this char, the slot in the mapping 1747 // vector will already be set for this char; don't overwrite it. 1748 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1749 tp.srcLine->addElement(lineNum, status); 1750 tp.srcCol ->addElement(column, status); 1751 } 1752 break; 1753 1754 1755 case PARSE_NUM: 1756 // We are parsing an expected numeric tag value, like <1234>, 1757 // within a chunk of data. 1758 if (u_isUWhiteSpace(c)) { 1759 break; 1760 } 1761 1762 if (c == CH_GT) { 1763 // Finished the number. Add the info to the expected break data, 1764 // and switch parse state back to doing plain data. 
1765 parseState = PARSE_DATA; 1766 if (tagValue == 0) { 1767 tagValue = -1; 1768 } 1769 int32_t breakIdx = tp.dataToBreak.length(); 1770 tp.expectedBreaks->setSize(breakIdx+1); 1771 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1772 tp.srcLine->setSize(breakIdx+1); 1773 tp.srcLine->setElementAt(lineNum, breakIdx); 1774 tp.srcCol ->setSize(breakIdx+1); 1775 tp.srcCol ->setElementAt(column, breakIdx); 1776 break; 1777 } 1778 1779 if (u_isdigit(c)) { 1780 tagValue = tagValue*10 + u_charDigitValue(c); 1781 break; 1782 } 1783 1784 errln("Syntax Error in test file at line %d, col %d", 1785 lineNum, column); 1786 parseState = PARSE_COMMENT; 1787 goto end_test; // Stop the test 1788 break; 1789 } 1790 1791 1792 if (U_FAILURE(status)) { 1793 errln("ICU Error %s while parsing test file at line %d.", 1794 u_errorName(status), lineNum); 1795 status = U_ZERO_ERROR; 1796 goto end_test; // Stop the test 1797 } 1798 1799 } 1800 1801 end_test: 1802 delete tp.bi; 1803 delete tp.expectedBreaks; 1804 delete tp.srcLine; 1805 delete tp.srcCol; 1806 delete [] testFile; 1807 #endif 1808 } 1809 1810 void RBBITest::TestThaiBreaks() { 1811 UErrorCode status=U_ZERO_ERROR; 1812 BreakIterator* b; 1813 Locale locale = Locale("th"); 1814 int32_t p, index; 1815 UChar c[]= { 1816 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 1817 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 1818 0x0E16, 0x0E49, 0x0E33, 0x0000 1819 }; 1820 int32_t expectedWordResult[] = { 1821 2, 3, 6, 10, 11, 15, 17, 20, 22 1822 }; 1823 int32_t expectedLineResult[] = { 1824 3, 6, 11, 15, 17, 20, 22 1825 }; 1826 1827 int32_t size = u_strlen(c); 1828 UnicodeString text=UnicodeString(c); 1829 1830 b = BreakIterator::createWordInstance(locale, status); 1831 if (U_FAILURE(status)) { 1832 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 1833 return; 1834 } 1835 b->setText(text); 1836 p = index = 0; 1837 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1838 if (p != expectedWordResult[index++]) { 1839 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 1840 } 1841 } 1842 delete b; 1843 1844 b = BreakIterator::createLineInstance(locale, status); 1845 if (U_FAILURE(status)) { 1846 printf("Unable to create thai line break iterator.\n"); 1847 return; 1848 } 1849 b->setText(text); 1850 p = index = 0; 1851 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1852 if (p != expectedLineResult[index++]) { 1853 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 1854 } 1855 } 1856 1857 delete b; 1858 } 1859 1860 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 1861 // Words don't include colon or period (cldrbug #1969). 1862 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 1863 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 1864 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 1865 1866 // UBreakIteratorType UBRK_WORD, Locale "ja" 1867 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
static const char jaWordText[] =
    "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
    "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
// Expected break offsets with the "ja" tailoring.
static const int32_t jaWordTOffsets[] = {
    2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28
};
// Expected break offsets with the root rules.
static const int32_t jaWordROffsets[] = {
    1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28
};

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
static const char elSentText[] =
    "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
    "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
// Expected break offsets with the "el" tailoring.
static const int32_t elSentTOffsets[] = {
    8, 14, 20, 27, 35, 36
};
// Expected break offsets with the root rules.
static const int32_t elSentROffsets[] = {
    20, 27, 35, 36
};

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
1882 static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 1883 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 1884 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 1885 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 1886 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 1887 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 1888 static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 1889 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 1890 29, 32, 33, 35, 37, 38, 40, 41 }; 1891 1892 typedef struct { 1893 UBreakIteratorType type; 1894 const char * locale; 1895 const char * escapedText; 1896 const int32_t * tailoredOffsets; 1897 int32_t tailoredOffsetsCount; 1898 const int32_t * rootOffsets; 1899 int32_t rootOffsetsCount; 1900 } TailoredBreakItem; 1901 1902 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 1903 1904 static const TailoredBreakItem tbItems[] = { 1905 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 1906 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 1907 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 1908 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 1909 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 1910 }; 1911 1912 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 1913 while (count-- > 0) { 1914 int writeCount; 1915 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 1916 buffer += writeCount; 1917 buflen -= writeCount; 1918 } 1919 } 1920 1921 enum { kMaxOffsetCount = 128 }; 1922 1923 void RBBITest::TBTest(BreakIterator* brkitr, int type, const 
char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 1924 brkitr->setText( CharsToUnicodeString(escapedText) ); 1925 int32_t foundOffsets[kMaxOffsetCount]; 1926 int32_t offset, foundOffsetsCount = 0; 1927 // do forwards iteration test 1928 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 1929 foundOffsets[foundOffsetsCount++] = offset; 1930 } 1931 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 1932 // log error for forwards test 1933 char formatExpect[512], formatFound[512]; 1934 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1935 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 1936 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 1937 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 1938 } else { 1939 // do backwards iteration test 1940 --foundOffsetsCount; // back off one from the end offset 1941 while ( foundOffsetsCount > 0 ) { 1942 offset = brkitr->previous(); 1943 if ( offset != foundOffsets[--foundOffsetsCount] ) { 1944 // log error for backwards test 1945 char formatExpect[512]; 1946 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1947 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 1948 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 1949 break; 1950 } 1951 } 1952 } 1953 } 1954 1955 void RBBITest::TestTailoredBreaks() { 1956 const TailoredBreakItem * tbItemPtr; 1957 Locale rootLocale = Locale("root"); 1958 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 1959 Locale testLocale = Locale(tbItemPtr->locale); 1960 BreakIterator * tailoredBrkiter; 
1961 BreakIterator * rootBrkiter; 1962 UErrorCode status = U_ZERO_ERROR; 1963 switch (tbItemPtr->type) { 1964 case UBRK_CHARACTER: 1965 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 1966 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 1967 break; 1968 case UBRK_WORD: 1969 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 1970 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 1971 break; 1972 case UBRK_LINE: 1973 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 1974 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 1975 break; 1976 case UBRK_SENTENCE: 1977 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 1978 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 1979 break; 1980 default: 1981 status = U_UNSUPPORTED_ERROR; 1982 break; 1983 } 1984 if (U_FAILURE(status)) { 1985 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 1986 continue; 1987 } 1988 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 1989 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 1990 1991 delete rootBrkiter; 1992 delete tailoredBrkiter; 1993 } 1994 } 1995 1996 1997 //------------------------------------------------------------------------------- 1998 // 1999 // TestDictRules create a break iterator from source rules that includes a 2000 // dictionary range. Regression for bug #7130. Source rules 2001 // do not declare a break iterator type (word, line, sentence, etc. 2002 // but the dictionary code, without a type, would loop. 
2003 // 2004 //------------------------------------------------------------------------------- 2005 void RBBITest::TestDictRules() { 2006 const char *rules = "$dictionary = [a-z]; \n" 2007 "!!forward; \n" 2008 "$dictionary $dictionary; \n" 2009 "!!reverse; \n" 2010 "$dictionary $dictionary; \n"; 2011 const char *text = "aa"; 2012 UErrorCode status = U_ZERO_ERROR; 2013 UParseError parseError; 2014 2015 RuleBasedBreakIterator bi(rules, parseError, status); 2016 if (U_SUCCESS(status)) { 2017 UnicodeString utext = text; 2018 bi.setText(utext); 2019 int32_t position; 2020 int32_t loops; 2021 for (loops = 0; loops<10; loops++) { 2022 position = bi.next(); 2023 if (position == RuleBasedBreakIterator::DONE) { 2024 break; 2025 } 2026 } 2027 TEST_ASSERT(loops == 1); 2028 } else { 2029 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 2030 } 2031 } 2032 2033 2034 2035 //------------------------------------------------------------------------------- 2036 // 2037 // ReadAndConvertFile Read a text data file, convert it to UChars, and 2038 // return the datain one big UChar * buffer, which the caller must delete. 2039 // 2040 // parameters: 2041 // fileName: the name of the file, with no directory part. The test data directory 2042 // is assumed. 2043 // ulen an out parameter, receives the actual length (in UChars) of the file data. 2044 // encoding The file encoding. If the file contains a BOM, that will override the encoding 2045 // specified here. The BOM, if it exists, will be stripped from the returned data. 2046 // Pass NULL for the system default encoding. 2047 // status 2048 // returns: 2049 // The file data, converted to UChar. 2050 // The caller must delete this when done with 2051 // delete [] theBuffer; 2052 // 2053 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 2054 // Move this function to some common place. 
2055 // 2056 //-------------------------------------------------------------------------------- 2057 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 2058 UChar *retPtr = NULL; 2059 char *fileBuf = NULL; 2060 UConverter* conv = NULL; 2061 FILE *f = NULL; 2062 2063 ulen = 0; 2064 if (U_FAILURE(status)) { 2065 return retPtr; 2066 } 2067 2068 // 2069 // Open the file. 2070 // 2071 f = fopen(fileName, "rb"); 2072 if (f == 0) { 2073 dataerrln("Error opening test data file %s\n", fileName); 2074 status = U_FILE_ACCESS_ERROR; 2075 return NULL; 2076 } 2077 // 2078 // Read it in 2079 // 2080 int fileSize; 2081 int amt_read; 2082 2083 fseek( f, 0, SEEK_END); 2084 fileSize = ftell(f); 2085 fileBuf = new char[fileSize]; 2086 fseek(f, 0, SEEK_SET); 2087 amt_read = fread(fileBuf, 1, fileSize, f); 2088 if (amt_read != fileSize || fileSize <= 0) { 2089 errln("Error reading test data file."); 2090 goto cleanUpAndReturn; 2091 } 2092 2093 // 2094 // Look for a Unicode Signature (BOM) on the data just read 2095 // 2096 int32_t signatureLength; 2097 const char * fileBufC; 2098 const char* bomEncoding; 2099 2100 fileBufC = fileBuf; 2101 bomEncoding = ucnv_detectUnicodeSignature( 2102 fileBuf, fileSize, &signatureLength, &status); 2103 if(bomEncoding!=NULL ){ 2104 fileBufC += signatureLength; 2105 fileSize -= signatureLength; 2106 encoding = bomEncoding; 2107 } 2108 2109 // 2110 // Open a converter to take the rule file to UTF-16 2111 // 2112 conv = ucnv_open(encoding, &status); 2113 if (U_FAILURE(status)) { 2114 goto cleanUpAndReturn; 2115 } 2116 2117 // 2118 // Convert the rules to UChar. 2119 // Preflight first to determine required buffer size. 2120 // 2121 ulen = ucnv_toUChars(conv, 2122 NULL, // dest, 2123 0, // destCapacity, 2124 fileBufC, 2125 fileSize, 2126 &status); 2127 if (status == U_BUFFER_OVERFLOW_ERROR) { 2128 // Buffer Overflow is expected from the preflight operation. 
2129 status = U_ZERO_ERROR; 2130 2131 retPtr = new UChar[ulen+1]; 2132 ucnv_toUChars(conv, 2133 retPtr, // dest, 2134 ulen+1, 2135 fileBufC, 2136 fileSize, 2137 &status); 2138 } 2139 2140 cleanUpAndReturn: 2141 fclose(f); 2142 delete []fileBuf; 2143 ucnv_close(conv); 2144 if (U_FAILURE(status)) { 2145 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2146 delete retPtr; 2147 retPtr = 0; 2148 ulen = 0; 2149 }; 2150 return retPtr; 2151 } 2152 2153 2154 2155 //-------------------------------------------------------------------------------------------- 2156 // 2157 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 2158 // 2159 //------------------------------------------------------------------------------------------- 2160 void RBBITest::TestUnicodeFiles() { 2161 RuleBasedBreakIterator *bi; 2162 UErrorCode status = U_ZERO_ERROR; 2163 2164 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status); 2165 TEST_ASSERT_SUCCESS(status); 2166 if (U_SUCCESS(status)) { 2167 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2168 } 2169 delete bi; 2170 2171 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status); 2172 TEST_ASSERT_SUCCESS(status); 2173 if (U_SUCCESS(status)) { 2174 runUnicodeTestData("WordBreakTest.txt", bi); 2175 } 2176 delete bi; 2177 2178 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 2179 TEST_ASSERT_SUCCESS(status); 2180 if (U_SUCCESS(status)) { 2181 runUnicodeTestData("SentenceBreakTest.txt", bi); 2182 } 2183 delete bi; 2184 2185 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 2186 TEST_ASSERT_SUCCESS(status); 2187 if (U_SUCCESS(status)) { 2188 runUnicodeTestData("LineBreakTest.txt", bi); 2189 } 2190 delete bi; 2191 } 2192 2193 2194 
//-------------------------------------------------------------------------------------------- 2195 // 2196 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 2197 // 2198 //------------------------------------------------------------------------------------------- 2199 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2200 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2201 UErrorCode status = U_ZERO_ERROR; 2202 2203 // 2204 // Open and read the test data file, put it into a UnicodeString. 2205 // 2206 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2207 char testFileName[1000]; 2208 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2209 dataerrln("Can't open test data. Path too long."); 2210 return; 2211 } 2212 strcpy(testFileName, testDataDirectory); 2213 strcat(testFileName, fileName); 2214 2215 logln("Opening data file %s\n", fileName); 2216 2217 int len; 2218 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2219 if (status != U_FILE_ACCESS_ERROR) { 2220 TEST_ASSERT_SUCCESS(status); 2221 TEST_ASSERT(testFile != NULL); 2222 } 2223 if (U_FAILURE(status) || testFile == NULL) { 2224 return; /* something went wrong, error already output */ 2225 } 2226 UnicodeString testFileAsString(TRUE, testFile, len); 2227 2228 // 2229 // Parse the test data file using a regular expression. 2230 // Each kind of token is recognized in its own capture group; what type of item was scanned 2231 // is identified by which group had a match. 
2232 // 2233 // Caputure Group # 1 2 3 4 5 2234 // Parses this item: divide x hex digits comment \n unrecognized \n 2235 // 2236 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2237 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2238 UnicodeString testString; 2239 UVector32 breakPositions(status); 2240 int lineNumber = 1; 2241 TEST_ASSERT_SUCCESS(status); 2242 if (U_FAILURE(status)) { 2243 return; 2244 } 2245 2246 // 2247 // Scan through each test case, building up the string to be broken in testString, 2248 // and the positions that should be boundaries in the breakPositions vector. 2249 // 2250 while (tokenMatcher.find()) { 2251 if (tokenMatcher.start(1, status) >= 0) { 2252 // Scanned a divide sign, indicating a break position in the test data. 2253 if (testString.length()>0) { 2254 breakPositions.addElement(testString.length(), status); 2255 } 2256 } 2257 else if (tokenMatcher.start(2, status) >= 0) { 2258 // Scanned an 'x', meaning no break at this position in the test data 2259 // Nothing to be done here. 2260 } 2261 else if (tokenMatcher.start(3, status) >= 0) { 2262 // Scanned Hex digits. Convert them to binary, append to the character data string. 2263 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2264 int length = hexNumber.length(); 2265 if (length<=8) { 2266 char buf[10]; 2267 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2268 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2269 if (c<=0x10ffff) { 2270 testString.append(c); 2271 } else { 2272 errln("Error: Unicode Character value out of range. 
\'%s\', line %d.\n", 2273 fileName, lineNumber); 2274 } 2275 } else { 2276 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2277 fileName, lineNumber); 2278 } 2279 } 2280 else if (tokenMatcher.start(4, status) >= 0) { 2281 // Scanned to end of a line, possibly skipping over a comment in the process. 2282 // If the line from the file contained test data, run the test now. 2283 // 2284 if (testString.length() > 0) { 2285 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2286 } 2287 2288 // Clear out this test case. 2289 // The string and breakPositions vector will be refilled as the next 2290 // test case is parsed. 2291 testString.remove(); 2292 breakPositions.removeAllElements(); 2293 lineNumber++; 2294 } else { 2295 // Scanner catchall. Something unrecognized appeared on the line. 2296 char token[16]; 2297 UnicodeString uToken = tokenMatcher.group(0, status); 2298 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2299 token[sizeof(token)-1] = 0; 2300 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2301 2302 // Clean up, in preparation for continuing with the next line. 2303 testString.remove(); 2304 breakPositions.removeAllElements(); 2305 lineNumber++; 2306 } 2307 TEST_ASSERT_SUCCESS(status); 2308 if (U_FAILURE(status)) { 2309 break; 2310 } 2311 } 2312 2313 delete [] testFile; 2314 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2315 } 2316 2317 //-------------------------------------------------------------------------------------------- 2318 // 2319 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2320 // test data files. Do only a simple, forward-only check - 2321 // this test is mostly to check that ICU and the Unicode 2322 // data agree with each other. 
2323 // 2324 //-------------------------------------------------------------------------------------------- 2325 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2326 const UnicodeString &testString, // Text data to be broken 2327 UVector32 *breakPositions, // Positions where breaks should be found. 2328 RuleBasedBreakIterator *bi) { 2329 int32_t pos; // Break Position in the test string 2330 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 2331 int32_t expectedPos; // Expected break position (index into test string) 2332 2333 bi->setText(testString); 2334 pos = bi->first(); 2335 pos = bi->next(); 2336 2337 while (pos != BreakIterator::DONE) { 2338 if (expectedI >= breakPositions->size()) { 2339 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2340 testFileName, lineNumber, pos); 2341 break; 2342 } 2343 expectedPos = breakPositions->elementAti(expectedI); 2344 if (pos < expectedPos) { 2345 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2346 testFileName, lineNumber, pos); 2347 break; 2348 } 2349 if (pos > expectedPos) { 2350 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2351 testFileName, lineNumber, expectedPos); 2352 break; 2353 } 2354 pos = bi->next(); 2355 expectedI++; 2356 } 2357 2358 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2359 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2360 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2361 } 2362 } 2363 2364 2365 2366 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2367 //--------------------------------------------------------------------------------------- 2368 // 2369 // classs RBBIMonkeyKind 2370 // 2371 // Monkey Test for Break Iteration 2372 // Abstract interface class. Concrete derived classes independently 2373 // implement the break rules for different iterator types. 
2374 // 2375 // The Monkey Test itself uses doesn't know which type of break iterator it is 2376 // testing, but works purely in terms of the interface defined here. 2377 // 2378 //--------------------------------------------------------------------------------------- 2379 class RBBIMonkeyKind { 2380 public: 2381 // Return a UVector of UnicodeSets, representing the character classes used 2382 // for this type of iterator. 2383 virtual UVector *charClasses() = 0; 2384 2385 // Set the test text on which subsequent calls to next() will operate 2386 virtual void setText(const UnicodeString &s) = 0; 2387 2388 // Find the next break postion, starting from the prev break position, or from zero. 2389 // Return -1 after reaching end of string. 2390 virtual int32_t next(int32_t i) = 0; 2391 2392 virtual ~RBBIMonkeyKind(); 2393 UErrorCode deferredStatus; 2394 2395 2396 protected: 2397 RBBIMonkeyKind(); 2398 2399 private: 2400 }; 2401 2402 RBBIMonkeyKind::RBBIMonkeyKind() { 2403 deferredStatus = U_ZERO_ERROR; 2404 } 2405 2406 RBBIMonkeyKind::~RBBIMonkeyKind() { 2407 } 2408 2409 2410 //---------------------------------------------------------------------------------------- 2411 // 2412 // Random Numbers. Similar to standard lib rand() and srand() 2413 // Not using library to 2414 // 1. Get same results on all platforms. 2415 // 2. Get access to current seed, to more easily reproduce failures. 2416 // 2417 //--------------------------------------------------------------------------------------- 2418 static uint32_t m_seed = 1; 2419 2420 static uint32_t m_rand() 2421 { 2422 m_seed = m_seed * 1103515245 + 12345; 2423 return (uint32_t)(m_seed/65536) % 32768; 2424 } 2425 2426 2427 //------------------------------------------------------------------------------------------ 2428 // 2429 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2430 // of RBBIMonkeyKind. 
2431 // 2432 //------------------------------------------------------------------------------------------ 2433 class RBBICharMonkey: public RBBIMonkeyKind { 2434 public: 2435 RBBICharMonkey(); 2436 virtual ~RBBICharMonkey(); 2437 virtual UVector *charClasses(); 2438 virtual void setText(const UnicodeString &s); 2439 virtual int32_t next(int32_t i); 2440 private: 2441 UVector *fSets; 2442 2443 UnicodeSet *fCRLFSet; 2444 UnicodeSet *fControlSet; 2445 UnicodeSet *fExtendSet; 2446 UnicodeSet *fPrependSet; 2447 UnicodeSet *fSpacingSet; 2448 UnicodeSet *fLSet; 2449 UnicodeSet *fVSet; 2450 UnicodeSet *fTSet; 2451 UnicodeSet *fLVSet; 2452 UnicodeSet *fLVTSet; 2453 UnicodeSet *fHangulSet; 2454 UnicodeSet *fAnySet; 2455 2456 const UnicodeString *fText; 2457 }; 2458 2459 2460 RBBICharMonkey::RBBICharMonkey() { 2461 UErrorCode status = U_ZERO_ERROR; 2462 2463 fText = NULL; 2464 2465 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2466 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2467 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2468 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2469 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2470 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2471 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2472 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2473 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2474 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2475 fHangulSet = new UnicodeSet(); 2476 fHangulSet->addAll(*fLSet); 2477 fHangulSet->addAll(*fVSet); 2478 
fHangulSet->addAll(*fTSet); 2479 fHangulSet->addAll(*fLVSet); 2480 fHangulSet->addAll(*fLVTSet); 2481 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); 2482 2483 fSets = new UVector(status); 2484 fSets->addElement(fCRLFSet, status); 2485 fSets->addElement(fControlSet, status); 2486 fSets->addElement(fExtendSet, status); 2487 fSets->addElement(fPrependSet, status); 2488 fSets->addElement(fSpacingSet, status); 2489 fSets->addElement(fHangulSet, status); 2490 fSets->addElement(fAnySet, status); 2491 if (U_FAILURE(status)) { 2492 deferredStatus = status; 2493 } 2494 } 2495 2496 2497 void RBBICharMonkey::setText(const UnicodeString &s) { 2498 fText = &s; 2499 } 2500 2501 2502 2503 int32_t RBBICharMonkey::next(int32_t prevPos) { 2504 int p0, p1, p2, p3; // Indices of the significant code points around the 2505 // break position being tested. The candidate break 2506 // location is before p2. 2507 2508 int breakPos = -1; 2509 2510 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2511 2512 if (U_FAILURE(deferredStatus)) { 2513 return -1; 2514 } 2515 2516 // Previous break at end of string. return DONE. 2517 if (prevPos >= fText->length()) { 2518 return -1; 2519 } 2520 p0 = p1 = p2 = p3 = prevPos; 2521 c3 = fText->char32At(prevPos); 2522 c0 = c1 = c2 = 0; 2523 2524 // Loop runs once per "significant" character position in the input text. 2525 for (;;) { 2526 // Move all of the positions forward in the input string. 2527 p0 = p1; c0 = c1; 2528 p1 = p2; c1 = c2; 2529 p2 = p3; c2 = c3; 2530 2531 // Advancd p3 by one codepoint 2532 p3 = fText->moveIndex32(p3, 1); 2533 c3 = fText->char32At(p3); 2534 2535 if (p1 == p2) { 2536 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2537 continue; 2538 } 2539 if (p2 == fText->length()) { 2540 // Reached end of string. Always a break position. 
2541 break; 2542 } 2543 2544 // Rule GB3 CR x LF 2545 // No Extend or Format characters may appear between the CR and LF, 2546 // which requires the additional check for p2 immediately following p1. 2547 // 2548 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2549 continue; 2550 } 2551 2552 // Rule (GB4). ( Control | CR | LF ) <break> 2553 if (fControlSet->contains(c1) || 2554 c1 == 0x0D || 2555 c1 == 0x0A) { 2556 break; 2557 } 2558 2559 // Rule (GB5) <break> ( Control | CR | LF ) 2560 // 2561 if (fControlSet->contains(c2) || 2562 c2 == 0x0D || 2563 c2 == 0x0A) { 2564 break; 2565 } 2566 2567 2568 // Rule (GB6) L x ( L | V | LV | LVT ) 2569 if (fLSet->contains(c1) && 2570 (fLSet->contains(c2) || 2571 fVSet->contains(c2) || 2572 fLVSet->contains(c2) || 2573 fLVTSet->contains(c2))) { 2574 continue; 2575 } 2576 2577 // Rule (GB7) ( LV | V ) x ( V | T ) 2578 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2579 (fVSet->contains(c2) || fTSet->contains(c2))) { 2580 continue; 2581 } 2582 2583 // Rule (GB8) ( LVT | T) x T 2584 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2585 fTSet->contains(c2)) { 2586 continue; 2587 } 2588 2589 // Rule (GB9) Numeric x ALetter 2590 if (fExtendSet->contains(c2)) { 2591 continue; 2592 } 2593 2594 // Rule (GB9a) x SpacingMark 2595 if (fSpacingSet->contains(c2)) { 2596 continue; 2597 } 2598 2599 // Rule (GB9b) Prepend x 2600 if (fPrependSet->contains(c1)) { 2601 continue; 2602 } 2603 2604 // Rule (GB10) Any <break> Any 2605 break; 2606 } 2607 2608 breakPos = p2; 2609 return breakPos; 2610 } 2611 2612 2613 2614 UVector *RBBICharMonkey::charClasses() { 2615 return fSets; 2616 } 2617 2618 2619 RBBICharMonkey::~RBBICharMonkey() { 2620 delete fSets; 2621 delete fCRLFSet; 2622 delete fControlSet; 2623 delete fExtendSet; 2624 delete fPrependSet; 2625 delete fSpacingSet; 2626 delete fLSet; 2627 delete fVSet; 2628 delete fTSet; 2629 delete fLVSet; 2630 delete fLVTSet; 2631 delete fHangulSet; 2632 delete fAnySet; 2633 } 2634 2635 
//------------------------------------------------------------------------------------------ 2636 // 2637 // class RBBIWordMonkey Word Break specific implementation 2638 // of RBBIMonkeyKind. 2639 // 2640 //------------------------------------------------------------------------------------------ 2641 class RBBIWordMonkey: public RBBIMonkeyKind { 2642 public: 2643 RBBIWordMonkey(); 2644 virtual ~RBBIWordMonkey(); 2645 virtual UVector *charClasses(); 2646 virtual void setText(const UnicodeString &s); 2647 virtual int32_t next(int32_t i); 2648 private: 2649 UVector *fSets; 2650 2651 UnicodeSet *fCRSet; 2652 UnicodeSet *fLFSet; 2653 UnicodeSet *fNewlineSet; 2654 UnicodeSet *fKatakanaSet; 2655 UnicodeSet *fALetterSet; 2656 UnicodeSet *fMidNumLetSet; 2657 UnicodeSet *fMidLetterSet; 2658 UnicodeSet *fMidNumSet; 2659 UnicodeSet *fNumericSet; 2660 UnicodeSet *fFormatSet; 2661 UnicodeSet *fOtherSet; 2662 UnicodeSet *fExtendSet; 2663 UnicodeSet *fExtendNumLetSet; 2664 2665 RegexMatcher *fMatcher; 2666 2667 const UnicodeString *fText; 2668 }; 2669 2670 2671 RBBIWordMonkey::RBBIWordMonkey() 2672 { 2673 UErrorCode status = U_ZERO_ERROR; 2674 2675 fSets = new UVector(status); 2676 2677 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2678 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2679 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2680 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2681 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2682 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2683 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2684 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2685 fNumericSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2686 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2687 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2688 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2689 2690 fOtherSet = new UnicodeSet(); 2691 if(U_FAILURE(status)) { 2692 deferredStatus = status; 2693 return; 2694 } 2695 2696 fOtherSet->complement(); 2697 fOtherSet->removeAll(*fCRSet); 2698 fOtherSet->removeAll(*fLFSet); 2699 fOtherSet->removeAll(*fNewlineSet); 2700 fOtherSet->removeAll(*fKatakanaSet); 2701 fOtherSet->removeAll(*fALetterSet); 2702 fOtherSet->removeAll(*fMidLetterSet); 2703 fOtherSet->removeAll(*fMidNumSet); 2704 fOtherSet->removeAll(*fNumericSet); 2705 fOtherSet->removeAll(*fExtendNumLetSet); 2706 fOtherSet->removeAll(*fFormatSet); 2707 fOtherSet->removeAll(*fExtendSet); 2708 // Inhibit dictionary characters from being tested at all. 
2709 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2710 2711 fSets->addElement(fCRSet, status); 2712 fSets->addElement(fLFSet, status); 2713 fSets->addElement(fNewlineSet, status); 2714 fSets->addElement(fALetterSet, status); 2715 fSets->addElement(fKatakanaSet, status); 2716 fSets->addElement(fMidLetterSet, status); 2717 fSets->addElement(fMidNumLetSet, status); 2718 fSets->addElement(fMidNumSet, status); 2719 fSets->addElement(fNumericSet, status); 2720 fSets->addElement(fFormatSet, status); 2721 fSets->addElement(fExtendSet, status); 2722 fSets->addElement(fOtherSet, status); 2723 fSets->addElement(fExtendNumLetSet, status); 2724 2725 if (U_FAILURE(status)) { 2726 deferredStatus = status; 2727 } 2728 } 2729 2730 void RBBIWordMonkey::setText(const UnicodeString &s) { 2731 fText = &s; 2732 } 2733 2734 2735 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2736 int p0, p1, p2, p3; // Indices of the significant code points around the 2737 // break position being tested. The candidate break 2738 // location is before p2. 2739 2740 int breakPos = -1; 2741 2742 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2743 2744 if (U_FAILURE(deferredStatus)) { 2745 return -1; 2746 } 2747 2748 // Prev break at end of string. return DONE. 2749 if (prevPos >= fText->length()) { 2750 return -1; 2751 } 2752 p0 = p1 = p2 = p3 = prevPos; 2753 c3 = fText->char32At(prevPos); 2754 c0 = c1 = c2 = 0; 2755 2756 // Loop runs once per "significant" character position in the input text. 2757 for (;;) { 2758 // Move all of the positions forward in the input string. 2759 p0 = p1; c0 = c1; 2760 p1 = p2; c1 = c2; 2761 p2 = p3; c2 = c3; 2762 2763 // Advancd p3 by X(Extend | Format)* Rule 4 2764 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 2765 do { 2766 p3 = fText->moveIndex32(p3, 1); 2767 c3 = fText->char32At(p3); 2768 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2769 break; 2770 }; 2771 } 2772 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2773 2774 2775 if (p1 == p2) { 2776 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2777 continue; 2778 } 2779 if (p2 == fText->length()) { 2780 // Reached end of string. Always a break position. 2781 break; 2782 } 2783 2784 // Rule (3) CR x LF 2785 // No Extend or Format characters may appear between the CR and LF, 2786 // which requires the additional check for p2 immediately following p1. 2787 // 2788 if (c1==0x0D && c2==0x0A) { 2789 continue; 2790 } 2791 2792 // Rule (3a) Break before and after newlines (including CR and LF) 2793 // 2794 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2795 break; 2796 }; 2797 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2798 break; 2799 }; 2800 2801 // Rule (5). 
ALetter x ALetter 2802 if (fALetterSet->contains(c1) && 2803 fALetterSet->contains(c2)) { 2804 continue; 2805 } 2806 2807 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 2808 // 2809 if ( fALetterSet->contains(c1) && 2810 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2811 fALetterSet->contains(c3)) { 2812 continue; 2813 } 2814 2815 2816 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 2817 if (fALetterSet->contains(c0) && 2818 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2819 fALetterSet->contains(c2)) { 2820 continue; 2821 } 2822 2823 // Rule (8) Numeric x Numeric 2824 if (fNumericSet->contains(c1) && 2825 fNumericSet->contains(c2)) { 2826 continue; 2827 } 2828 2829 // Rule (9) ALetter x Numeric 2830 if (fALetterSet->contains(c1) && 2831 fNumericSet->contains(c2)) { 2832 continue; 2833 } 2834 2835 // Rule (10) Numeric x ALetter 2836 if (fNumericSet->contains(c1) && 2837 fALetterSet->contains(c2)) { 2838 continue; 2839 } 2840 2841 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 2842 if (fNumericSet->contains(c0) && 2843 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2844 fNumericSet->contains(c2)) { 2845 continue; 2846 } 2847 2848 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 2849 if (fNumericSet->contains(c1) && 2850 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2851 fNumericSet->contains(c3)) { 2852 continue; 2853 } 2854 2855 // Rule (13) Katakana x Katakana 2856 if (fKatakanaSet->contains(c1) && 2857 fKatakanaSet->contains(c2)) { 2858 continue; 2859 } 2860 2861 // Rule 13a 2862 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 2863 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2864 fExtendNumLetSet->contains(c2)) { 2865 continue; 2866 } 2867 2868 // Rule 13b 2869 if (fExtendNumLetSet->contains(c1) && 2870 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 2871 fKatakanaSet->contains(c2))) { 2872 continue; 2873 } 2874 2875 // Rule 14. 
// charClasses()   Accessor for fSets, the vector this monkey hands out as its
//                 list of character classes.  The vector stays owned by the monkey.
UVector  *RBBIWordMonkey::charClasses() {
    return fSets;
}


// Destructor.  Releases the class vector and each of the UnicodeSet members
// listed below.
RBBIWordMonkey::~RBBIWordMonkey() {
    delete fSets;
    delete fCRSet;
    delete fLFSet;
    delete fNewlineSet;
    delete fKatakanaSet;
    delete fALetterSet;
    delete fMidNumLetSet;
    delete fMidLetterSet;
    delete fMidNumSet;
    delete fNumericSet;
    delete fFormatSet;
    delete fExtendSet;
    delete fExtendNumLetSet;
    delete fOtherSet;
}




//------------------------------------------------------------------------------------------
//
//   class RBBISentMonkey      Sentence Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    virtual  UVector *charClasses();                     // returns fSets
    virtual  void     setText(const UnicodeString &s);   // stores the address of s in fText
    virtual  int32_t  next(int32_t i);                   // next break at or after i, or -1
private:
    // Index helpers used by next().  moveBack() and moveForward() step over
    // code points that are in fExtendSet or fFormatSet; cAt() is a bounds-checked
    // char32At() that yields -1 outside the text.
    int               moveBack(int posFrom);
    int               moveForward(int posFrom);
    UChar32           cAt(int pos);

    UVector      *fSets;          // the sets below, in the order the constructor adds them

    // One set per Sentence_Break property value (patterns are in the constructor).
    UnicodeSet  *fSepSet;         // Sep, plus U+000A and U+000D
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;       // everything not contained in any of the other sets
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;  // text under test; assigned by setText(), never deleted here

};
For the monkey impl, 2955 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2956 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2957 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2958 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2959 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2960 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2961 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2962 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2963 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2964 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2965 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2966 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2967 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2968 fOtherSet = new UnicodeSet(); 2969 2970 if(U_FAILURE(status)) { 2971 deferredStatus = status; 2972 return; 2973 } 2974 2975 fOtherSet->complement(); 2976 fOtherSet->removeAll(*fSepSet); 2977 fOtherSet->removeAll(*fFormatSet); 2978 fOtherSet->removeAll(*fSpSet); 2979 fOtherSet->removeAll(*fLowerSet); 2980 fOtherSet->removeAll(*fUpperSet); 2981 fOtherSet->removeAll(*fOLetterSet); 2982 fOtherSet->removeAll(*fNumericSet); 2983 fOtherSet->removeAll(*fATermSet); 2984 fOtherSet->removeAll(*fSContinueSet); 2985 fOtherSet->removeAll(*fSTermSet); 2986 fOtherSet->removeAll(*fCloseSet); 2987 fOtherSet->removeAll(*fExtendSet); 2988 2989 fSets->addElement(fSepSet, status); 2990 
fSets->addElement(fFormatSet, status); 2991 fSets->addElement(fSpSet, status); 2992 fSets->addElement(fLowerSet, status); 2993 fSets->addElement(fUpperSet, status); 2994 fSets->addElement(fOLetterSet, status); 2995 fSets->addElement(fNumericSet, status); 2996 fSets->addElement(fATermSet, status); 2997 fSets->addElement(fSContinueSet, status); 2998 fSets->addElement(fSTermSet, status); 2999 fSets->addElement(fCloseSet, status); 3000 fSets->addElement(fOtherSet, status); 3001 fSets->addElement(fExtendSet, status); 3002 3003 if (U_FAILURE(status)) { 3004 deferredStatus = status; 3005 } 3006 } 3007 3008 3009 3010 void RBBISentMonkey::setText(const UnicodeString &s) { 3011 fText = &s; 3012 } 3013 3014 UVector *RBBISentMonkey::charClasses() { 3015 return fSets; 3016 } 3017 3018 3019 // moveBack() Find the "significant" code point preceding the index i. 3020 // Skips over ($Extend | $Format)* . 3021 // 3022 int RBBISentMonkey::moveBack(int i) { 3023 if (i <= 0) { 3024 return -1; 3025 } 3026 UChar32 c; 3027 int32_t j = i; 3028 do { 3029 j = fText->moveIndex32(j, -1); 3030 c = fText->char32At(j); 3031 } 3032 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 3033 return j; 3034 3035 } 3036 3037 3038 int RBBISentMonkey::moveForward(int i) { 3039 if (i>=fText->length()) { 3040 return fText->length(); 3041 } 3042 UChar32 c; 3043 int32_t j = i; 3044 do { 3045 j = fText->moveIndex32(j, 1); 3046 c = cAt(j); 3047 } 3048 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 3049 return j; 3050 } 3051 3052 UChar32 RBBISentMonkey::cAt(int pos) { 3053 if (pos<0 || pos>=fText->length()) { 3054 return -1; 3055 } else { 3056 return fText->char32At(pos); 3057 } 3058 } 3059 3060 int32_t RBBISentMonkey::next(int32_t prevPos) { 3061 int p0, p1, p2, p3; // Indices of the significant code points around the 3062 // break position being tested. The candidate break 3063 // location is before p2. 
//  next()   Reference implementation of sentence-break detection.
//
//           Starting from the previous break position prevPos, walks forward one
//           "significant" code point at a time (Extend and Format code points are
//           skipped by moveForward()) and applies the sentence rules in order.
//           A rule that forbids a break does `continue`; a rule that requires a
//           break does `break` out of the loop.
//
//           Returns the index of the next break, or -1 if the constructor failed
//           (deferredStatus) or prevPos is already at or past the end of the text.
//
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3) CR x LF
        //    The p2==(p1+1) test requires the LF to follow the CR immediately,
        //    with no skipped Extend or Format code points in between.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           First scan backward from p1 over Sp* then Close*; moveBack() returns -1
        //           at the start of the text and cAt(-1) is -1, which ends both loops.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Then scan forward from p2 to the first code point that is in the
            // excluded set (or the end of the text) and test whether it is Lower.
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8)))  {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        //           CR and LF are members of fSepSet in this implementation.
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        // Rule (12)  Any x Any
        continue;
    }
    breakPos = p2;
    return breakPos;
}

// Destructor.  Releases the class vector and every UnicodeSet created by the
// constructor.  fText is not deleted; it only points at the caller's string.
RBBISentMonkey::~RBBISentMonkey() {
    delete fSets;
    delete fSepSet;
    delete fFormatSet;
    delete fSpSet;
    delete fLowerSet;
    delete fUpperSet;
    delete fOLetterSet;
    delete fNumericSet;
    delete fATermSet;
    delete fSContinueSet;
    delete fSTermSet;
    delete fCloseSet;
    delete fOtherSet;
    delete fExtendSet;
}
//-------------------------------------------------------------------------------------------
//
//  RBBILineMonkey      Line Break specific implementation of RBBIMonkeyKind.
//
//-------------------------------------------------------------------------------------------

class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    virtual  UVector *charClasses();                     // returns fSets
    virtual  void     setText(const UnicodeString &s);   // attaches s to fText, fCharBI and fNumberMatcher
    virtual  int32_t  next(int32_t i);                   // next line break at or after i, or -1
    // Applies line-break rules LB 9 and LB 10 (combining-mark handling) around pos.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;     // sets handed out by charClasses(); filled in by the constructor

    // One set per Line_Break property value; the member name is the two-letter
    // class code used in the constructor's set patterns.
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;        // built from the surrogate range [\ud800-\udfff], not from a property
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;        // the constructor also folds XX, AI, SA and SG into this set
    UnicodeSet  *fID;
    UnicodeSet  *fSA;
    UnicodeSet  *fXX;

    BreakIterator        *fCharBI;          // character break iterator created in the constructor

    const UnicodeString  *fText;            // text under test; assigned by setText(), not owned
    int32_t              *fOrigPositions;   // NOTE(review): not referenced by any member function shown here

    RegexMatcher         *fNumberMatcher;   // matches the number pattern used by rule LB 25 in next()
    RegexMatcher         *fLB11Matcher;     // NOTE(review): not referenced by any member function shown here
};
status); 3320 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 3321 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3322 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3323 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3324 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3325 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3326 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3327 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3328 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3329 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3330 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3331 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3332 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3333 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3334 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3335 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3336 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3337 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3338 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3339 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3340 3341 if (U_FAILURE(status)) { 3342 deferredStatus = status; 3343 fCharBI = NULL; 3344 fNumberMatcher = NULL; 3345 return; 3346 } 3347 3348 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3349 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3350 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults 
to AL 3351 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3352 3353 fSets->addElement(fBK, status); 3354 fSets->addElement(fCR, status); 3355 fSets->addElement(fLF, status); 3356 fSets->addElement(fCM, status); 3357 fSets->addElement(fNL, status); 3358 fSets->addElement(fWJ, status); 3359 fSets->addElement(fZW, status); 3360 fSets->addElement(fGL, status); 3361 fSets->addElement(fCB, status); 3362 fSets->addElement(fSP, status); 3363 fSets->addElement(fB2, status); 3364 fSets->addElement(fBA, status); 3365 fSets->addElement(fBB, status); 3366 fSets->addElement(fHY, status); 3367 fSets->addElement(fH2, status); 3368 fSets->addElement(fH3, status); 3369 fSets->addElement(fCL, status); 3370 fSets->addElement(fCP, status); 3371 fSets->addElement(fEX, status); 3372 fSets->addElement(fIN, status); 3373 fSets->addElement(fJL, status); 3374 fSets->addElement(fJT, status); 3375 fSets->addElement(fJV, status); 3376 fSets->addElement(fNS, status); 3377 fSets->addElement(fOP, status); 3378 fSets->addElement(fQU, status); 3379 fSets->addElement(fIS, status); 3380 fSets->addElement(fNU, status); 3381 fSets->addElement(fPO, status); 3382 fSets->addElement(fPR, status); 3383 fSets->addElement(fSY, status); 3384 fSets->addElement(fAI, status); 3385 fSets->addElement(fAL, status); 3386 fSets->addElement(fID, status); 3387 fSets->addElement(fWJ, status); 3388 fSets->addElement(fSA, status); 3389 fSets->addElement(fSG, status); 3390 3391 const char *rules = 3392 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3393 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3394 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3395 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3396 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
3397 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3398 3399 fNumberMatcher = new RegexMatcher( 3400 UnicodeString(rules, -1, US_INV), 0, status); 3401 3402 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3403 3404 if (U_FAILURE(status)) { 3405 deferredStatus = status; 3406 } 3407 } 3408 3409 3410 void RBBILineMonkey::setText(const UnicodeString &s) { 3411 fText = &s; 3412 fCharBI->setText(s); 3413 fNumberMatcher->reset(s); 3414 } 3415 3416 // 3417 // rule9Adjust 3418 // Line Break TR rules 9 and 10 implementation. 3419 // This deals with combining marks and other sequences that 3420 // that must be treated as if they were something other than what they actually are. 3421 // 3422 // This is factored out into a separate function because it must be applied twice for 3423 // each potential break, once to the chars before the position being checked, then 3424 // again to the text following the possible break. 3425 // 3426 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3427 if (pos == -1) { 3428 // Invalid initial position. Happens during the warmup iteration of the 3429 // main loop in next(). 3430 return; 3431 } 3432 3433 int32_t nPos = *nextPos; 3434 3435 // LB 9 Keep combining sequences together. 3436 // advance over any CM class chars. Note that Line Break CM is different 3437 // from the normal Grapheme Extend property. 3438 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3439 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3440 for (;;) { 3441 *nextChar = fText->char32At(nPos); 3442 if (!fCM->contains(*nextChar)) { 3443 break; 3444 } 3445 nPos = fText->moveIndex32(nPos, 1); 3446 } 3447 } 3448 3449 3450 // LB 9 Treat X CM* as if it were x. 3451 // No explicit action required. 
3452 3453 // LB 10 Treat any remaining combining mark as AL 3454 if (fCM->contains(*posChar)) { 3455 *posChar = 0x41; // thisChar = 'A'; 3456 } 3457 3458 // Push the updated nextPos and nextChar back to our caller. 3459 // This only makes a difference if posChar got bigger by consuming a 3460 // combining sequence. 3461 *nextPos = nPos; 3462 *nextChar = fText->char32At(nPos); 3463 } 3464 3465 3466 3467 int32_t RBBILineMonkey::next(int32_t startPos) { 3468 UErrorCode status = U_ZERO_ERROR; 3469 int32_t pos; // Index of the char following a potential break position 3470 UChar32 thisChar; // Character at above position "pos" 3471 3472 int32_t prevPos; // Index of the char preceding a potential break position 3473 UChar32 prevChar; // Character at above position. Note that prevChar 3474 // and thisChar may not be adjacent because combining 3475 // characters between them will be ignored. 3476 3477 int32_t nextPos; // Index of the next character following pos. 3478 // Usually skips over combining marks. 3479 int32_t nextCPPos; // Index of the code point following "pos." 3480 // May point to a combining mark. 3481 int32_t tPos; // temp value. 3482 UChar32 c; 3483 3484 if (U_FAILURE(deferredStatus)) { 3485 return -1; 3486 } 3487 3488 if (startPos >= fText->length()) { 3489 return -1; 3490 } 3491 3492 3493 // Initial values for loop. Loop will run the first time without finding breaks, 3494 // while the invalid values shift out and the "this" and 3495 // "prev" positions are filled in with good values. 3496 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 3497 thisChar = prevChar = 0; 3498 nextPos = nextCPPos = startPos; 3499 3500 3501 // Loop runs once per position in the test text, until a break position 3502 // is found. 
//  next()   Reference implementation of line-break detection.
//
//           Starting at startPos, examines successive (prevChar, thisChar) pairs,
//           after rule9Adjust() has absorbed combining marks, and applies the
//           line-break rules in order.  A rule that forbids a break does
//           `continue`; a rule that requires one does `break` out of the loop.
//
//           Returns the index of the next break, or -1 if the constructor failed
//           (deferredStatus) or startPos is at or past the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //                                   -1 positions out of prevPos yet - loop back to advance the
        //                                    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall into LB 25 and the more general number regular expression.
        //        (The code below also excludes NU x SY in the same way.)
        //
        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
            !fNU->contains(prevChar) && fCP->contains(thisChar) ||
                                        fEX->contains(thisChar) ||
            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        //       NOTE(review): "Rule 67" looks like an older rule numbering;
        //       presumably it refers to the LB 9/10 adjustment done by
        //       rule9Adjust() -- TODO confirm.
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // Note: this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        //    x BA,  x HY,  x NS,  BB x
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 22
        //    (AL | ID | IN | NU) x IN
        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
            fID->contains(prevChar) && fIN->contains(thisChar) ||
            fIN->contains(prevChar) && fIN->contains(thisChar) ||
            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //          PR x ID
        //          PR x AL
        //          PO x AL
        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
            fPR->contains(prevChar) && fAL->contains(thisChar) ||
            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }



        // LB 25    Numbers
        //          Try to match the number pattern starting at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //          (JL | JV | JT | H2 | H3) x IN
        //          (JL | JV | JT | H2 | H3) x PO
        //          PR x (JL | JV | JT | H2 | H3)
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28 Do not break between alphabetics ("at").
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //          (AL | NU) x OP
        //          CP x (AL | NU)
        if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    return pos;
}


// charClasses()   Accessor for the vector of sets filled in by the constructor.
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}


// Destructor.  Releases the class vector, every UnicodeSet created by the
// constructor, the character break iterator and the number matcher.
// fText is not deleted; it only points at the caller's string.
// NOTE(review): fOrigPositions and fLB11Matcher are not released here; no code
// visible in this class allocates them.
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fID;
    delete fSA;
    delete fSG;
    delete fXX;

    delete fCharBI;
    delete fNumberMatcher;
}
3889 // 3890 // type = char | word | line | sent | title 3891 // 3892 //------------------------------------------------------------------------------------------- 3893 3894 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3895 int32_t val = defaultVal; 3896 name.append(" *= *(-?\\d+)"); 3897 UErrorCode status = U_ZERO_ERROR; 3898 RegexMatcher m(name, params, 0, status); 3899 if (m.find()) { 3900 // The param exists. Convert the string to an int. 3901 char valString[100]; 3902 int32_t paramLength = m.end(1, status) - m.start(1, status); 3903 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3904 paramLength = (int32_t)(sizeof(valString)-2); 3905 } 3906 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3907 val = strtol(valString, NULL, 10); 3908 3909 // Delete this parameter from the params string. 3910 m.reset(); 3911 params = m.replaceFirst("", status); 3912 } 3913 U_ASSERT(U_SUCCESS(status)); 3914 return val; 3915 } 3916 #endif 3917 3918 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3919 BreakIterator *bi, 3920 int expected[], 3921 int expectedcount) 3922 { 3923 int count = 0; 3924 int i = 0; 3925 int forward[50]; 3926 bi->setText(ustr); 3927 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3928 forward[count] = i; 3929 if (count < expectedcount && expected[count] != i) { 3930 test->errln("break forward test failed: expected %d but got %d", 3931 expected[count], i); 3932 break; 3933 } 3934 count ++; 3935 } 3936 if (count != expectedcount) { 3937 printStringBreaks(ustr, expected, expectedcount); 3938 test->errln("break forward test failed: missed %d match", 3939 expectedcount - count); 3940 return; 3941 } 3942 // testing boundaries 3943 for (i = 1; i < expectedcount; i ++) { 3944 int j = expected[i - 1]; 3945 if (!bi->isBoundary(j)) { 3946 printStringBreaks(ustr, expected, expectedcount); 3947 test->errln("isBoundary() failed. 
Expected boundary at position %d", j); 3948 return; 3949 } 3950 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3951 if (bi->isBoundary(j)) { 3952 printStringBreaks(ustr, expected, expectedcount); 3953 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3954 return; 3955 } 3956 } 3957 } 3958 3959 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3960 count --; 3961 if (forward[count] != i) { 3962 test->errln("happy break test previous() failed: expected %d but got %d", 3963 forward[count], i); 3964 break; 3965 } 3966 } 3967 if (count != 0) { 3968 printStringBreaks(ustr, expected, expectedcount); 3969 test->errln("break test previous() failed: missed a match"); 3970 return; 3971 } 3972 3973 // testing preceding 3974 for (i = 0; i < expectedcount - 1; i ++) { 3975 // int j = expected[i] + 1; 3976 int j = ustr.moveIndex32(expected[i], 1); 3977 for (; j <= expected[i + 1]; j ++) { 3978 if (bi->preceding(j) != expected[i]) { 3979 printStringBreaks(ustr, expected, expectedcount); 3980 test->errln("preceding(): Not expecting boundary at position %d", j); 3981 return; 3982 } 3983 } 3984 } 3985 } 3986 3987 void RBBITest::TestWordBreaks(void) 3988 { 3989 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3990 3991 Locale locale("en"); 3992 UErrorCode status = U_ZERO_ERROR; 3993 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3994 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3995 static const char *strlist[] = 3996 { 3997 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3998 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 3999 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 4000 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 4001 "\\u90ca\\u3588\\u009c\\u0953\\u194b", 4002 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4003 
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 4004 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 4005 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4006 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4007 "\\u2027\\U000e0067\\u0a47\\u00b7", 4008 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4009 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4010 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4011 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4012 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4013 "\\u0027\\u11af\\U000e0057\\u0602", 4014 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4015 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4016 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4017 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4018 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4019 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4020 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4021 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4022 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4023 "\\u58f4\\U000e0049\\u20e7\\u2027", 4024 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4025 "\\ua183\\u102d\\u0bec\\u003a", 4026 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4027 "\\u003a\\u0e57\\u0fad\\u002e", 4028 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4029 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4030 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 4031 "\\u003a\\u0664\\u00b7\\u1fba", 4032 "\\u003b\\u0027\\u00b7\\u47a3", 4033 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 4034 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 4035 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 4036 }; 4037 int loop; 4038 if (U_FAILURE(status)) { 4039 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4040 return; 4041 } 4042 for (loop = 0; 
loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4043 // printf("looping %d\n", loop); 4044 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 4045 // RBBICharMonkey monkey; 4046 RBBIWordMonkey monkey; 4047 4048 int expected[50]; 4049 int expectedcount = 0; 4050 4051 monkey.setText(ustr); 4052 int i; 4053 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4054 expected[expectedcount ++] = i; 4055 } 4056 4057 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4058 } 4059 delete bi; 4060 #endif 4061 } 4062 4063 void RBBITest::TestWordBoundary(void) 4064 { 4065 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 4066 Locale locale("en"); 4067 UErrorCode status = U_ZERO_ERROR; 4068 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4069 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4070 UChar str[50]; 4071 static const char *strlist[] = 4072 { 4073 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4074 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4075 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4076 "\\u2027\\U000e0067\\u0a47\\u00b7", 4077 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4078 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4079 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4080 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4081 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4082 "\\u0027\\u11af\\U000e0057\\u0602", 4083 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4084 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4085 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4086 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4087 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4088 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4089 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4090 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4091 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4092 "\\u58f4\\U000e0049\\u20e7\\u2027", 
4093 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4094 "\\ua183\\u102d\\u0bec\\u003a", 4095 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4096 "\\u003a\\u0e57\\u0fad\\u002e", 4097 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4098 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4099 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4100 "\\u003a\\u0664\\u00b7\\u1fba", 4101 "\\u003b\\u0027\\u00b7\\u47a3", 4102 }; 4103 int loop; 4104 if (U_FAILURE(status)) { 4105 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4106 return; 4107 } 4108 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4109 // printf("looping %d\n", loop); 4110 u_unescape(strlist[loop], str, 20); 4111 UnicodeString ustr(str); 4112 int forward[50]; 4113 int count = 0; 4114 4115 bi->setText(ustr); 4116 int prev = 0; 4117 int i; 4118 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4119 forward[count ++] = i; 4120 if (i > prev) { 4121 int j; 4122 for (j = prev + 1; j < i; j ++) { 4123 if (bi->isBoundary(j)) { 4124 printStringBreaks(ustr, forward, count); 4125 errln("happy boundary test failed: expected %d not a boundary", 4126 j); 4127 return; 4128 } 4129 } 4130 } 4131 if (!bi->isBoundary(i)) { 4132 printStringBreaks(ustr, forward, count); 4133 errln("happy boundary test failed: expected %d a boundary", 4134 i); 4135 return; 4136 } 4137 prev = i; 4138 } 4139 } 4140 delete bi; 4141 } 4142 4143 void RBBITest::TestLineBreaks(void) 4144 { 4145 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4146 Locale locale("en"); 4147 UErrorCode status = U_ZERO_ERROR; 4148 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4149 const int32_t STRSIZE = 50; 4150 UChar str[STRSIZE]; 4151 static const char *strlist[] = 4152 { 4153 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4154 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 
4155 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4156 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4157 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4158 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 4159 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4160 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4161 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4162 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4163 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4164 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4165 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4166 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4167 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4168 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4169 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4170 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4171 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4172 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4173 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4174 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4175 
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4176 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4177 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4178 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4179 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4180 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4181 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4182 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4183 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4184 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4185 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4186 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4187 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4188 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4189 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4190 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4191 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4192 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4193 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4194 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4195 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4196 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4197 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4198 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4199 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4200 }; 4201 int loop; 4202 TEST_ASSERT_SUCCESS(status); 4203 if (U_FAILURE(status)) { 4204 return; 4205 } 4206 
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4207 // printf("looping %d\n", loop); 4208 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4209 if (t >= STRSIZE) { 4210 TEST_ASSERT(FALSE); 4211 continue; 4212 } 4213 4214 4215 UnicodeString ustr(str); 4216 RBBILineMonkey monkey; 4217 if (U_FAILURE(monkey.deferredStatus)) { 4218 continue; 4219 } 4220 4221 const int EXPECTEDSIZE = 50; 4222 int expected[EXPECTEDSIZE]; 4223 int expectedcount = 0; 4224 4225 monkey.setText(ustr); 4226 int i; 4227 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4228 if (expectedcount >= EXPECTEDSIZE) { 4229 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4230 return; 4231 } 4232 expected[expectedcount ++] = i; 4233 } 4234 4235 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4236 } 4237 delete bi; 4238 #endif 4239 } 4240 4241 void RBBITest::TestSentBreaks(void) 4242 { 4243 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4244 Locale locale("en"); 4245 UErrorCode status = U_ZERO_ERROR; 4246 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4247 UChar str[200]; 4248 static const char *strlist[] = 4249 { 4250 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4251 "This\n", 4252 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4253 "\"Sentence ending with a quote.\" Bye.", 4254 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4255 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4256 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4257 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4258 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4259 "Don't rock the boat.\\u2029Because I am the daddy, that is why. 
Not on my time (el timo.)!", 4260 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4261 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4262 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4263 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4264 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4265 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4266 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4267 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4268 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4269 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4270 }; 4271 int loop; 4272 if (U_FAILURE(status)) { 4273 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4274 return; 4275 } 4276 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4277 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4278 UnicodeString ustr(str); 4279 4280 RBBISentMonkey monkey; 4281 if (U_FAILURE(monkey.deferredStatus)) { 4282 continue; 4283 } 4284 4285 const int EXPECTEDSIZE = 50; 4286 int expected[EXPECTEDSIZE]; 4287 int expectedcount = 0; 4288 4289 monkey.setText(ustr); 4290 int i; 4291 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4292 if (expectedcount >= EXPECTEDSIZE) { 4293 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4294 return; 4295 } 4296 expected[expectedcount ++] = i; 4297 } 4298 4299 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4300 } 4301 delete bi; 4302 #endif 4303 } 4304 4305 void RBBITest::TestMonkey(char *params) { 4306 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4307 4308 UErrorCode status = U_ZERO_ERROR; 4309 int32_t loopCount = 500; 4310 int32_t seed = 1; 4311 UnicodeString breakType = "all"; 4312 Locale locale("en"); 4313 UBool 
useUText = FALSE; 4314 4315 if (quick == FALSE) { 4316 loopCount = 10000; 4317 } 4318 4319 if (params) { 4320 UnicodeString p(params); 4321 loopCount = getIntParam("loop", p, loopCount); 4322 seed = getIntParam("seed", p, seed); 4323 4324 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4325 if (m.find()) { 4326 breakType = m.group(1, status); 4327 m.reset(); 4328 p = m.replaceFirst("", status); 4329 } 4330 4331 RegexMatcher u(" *utext", p, 0, status); 4332 if (u.find()) { 4333 useUText = TRUE; 4334 u.reset(); 4335 p = u.replaceFirst("", status); 4336 } 4337 4338 4339 // m.reset(p); 4340 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4341 // Each option is stripped out of the option string as it is processed. 4342 // All options have been checked. The option string should have been completely emptied.. 4343 char buf[100]; 4344 p.extract(buf, sizeof(buf), NULL, status); 4345 buf[sizeof(buf)-1] = 0; 4346 errln("Unrecognized or extra parameter: %s\n", buf); 4347 return; 4348 } 4349 4350 } 4351 4352 if (breakType == "char" || breakType == "all") { 4353 RBBICharMonkey m; 4354 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4355 if (U_SUCCESS(status)) { 4356 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4357 if (breakType == "all" && useUText==FALSE) { 4358 // Also run a quick test with UText when "all" is specified 4359 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4360 } 4361 } 4362 else { 4363 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4364 } 4365 delete bi; 4366 } 4367 4368 if (breakType == "word" || breakType == "all") { 4369 logln("Word Break Monkey Test"); 4370 RBBIWordMonkey m; 4371 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4372 if (U_SUCCESS(status)) { 4373 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4374 } 4375 else { 4376 errcheckln(status, "Creation of word break iterator failed %s", 
u_errorName(status)); 4377 } 4378 delete bi; 4379 } 4380 4381 if (breakType == "line" || breakType == "all") { 4382 logln("Line Break Monkey Test"); 4383 RBBILineMonkey m; 4384 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4385 if (loopCount >= 10) { 4386 loopCount = loopCount / 5; // Line break runs slower than the others. 4387 } 4388 if (U_SUCCESS(status)) { 4389 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4390 } 4391 else { 4392 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4393 } 4394 delete bi; 4395 } 4396 4397 if (breakType == "sent" || breakType == "all" ) { 4398 logln("Sentence Break Monkey Test"); 4399 RBBISentMonkey m; 4400 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4401 if (loopCount >= 10) { 4402 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4403 } 4404 if (U_SUCCESS(status)) { 4405 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4406 } 4407 else { 4408 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4409 } 4410 delete bi; 4411 } 4412 4413 #endif 4414 } 4415 4416 // 4417 // Run a RBBI monkey test. Common routine, for all break iterator types. 4418 // Parameters: 4419 // bi - the break iterator to use 4420 // mk - MonkeyKind, abstraction for obtaining expected results 4421 // name - Name of test (char, word, etc.) 
for use in error messages 4422 // seed - Seed for starting random number generator (parameter from user) 4423 // numIterations 4424 // 4425 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4426 int32_t numIterations, UBool useUText) { 4427 4428 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4429 4430 const int32_t TESTSTRINGLEN = 500; 4431 UnicodeString testText; 4432 int32_t numCharClasses; 4433 UVector *chClasses; 4434 int expected[TESTSTRINGLEN*2 + 1]; 4435 int expectedCount = 0; 4436 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4437 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4438 char reverseBreaks[TESTSTRINGLEN*2+1]; 4439 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4440 char followingBreaks[TESTSTRINGLEN*2+1]; 4441 char precedingBreaks[TESTSTRINGLEN*2+1]; 4442 int i; 4443 int loopCount = 0; 4444 4445 m_seed = seed; 4446 4447 numCharClasses = mk.charClasses()->size(); 4448 chClasses = mk.charClasses(); 4449 4450 // Check for errors that occured during the construction of the MonkeyKind object. 4451 // Can't report them where they occured because errln() is a method coming from intlTest, 4452 // and is not visible outside of RBBITest :-( 4453 if (U_FAILURE(mk.deferredStatus)) { 4454 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4455 return; 4456 } 4457 4458 // Verify that the character classes all have at least one member. 4459 for (i=0; i<numCharClasses; i++) { 4460 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4461 if (s == NULL || s->size() == 0) { 4462 errln("Character Class #%d is null or of zero size.", i); 4463 return; 4464 } 4465 } 4466 4467 while (loopCount < numIterations || numIterations == -1) { 4468 if (numIterations == -1 && loopCount % 10 == 0) { 4469 // If test is running in an infinite loop, display a periodic tic so 4470 // we can tell that it is making progress. 
4471 fprintf(stderr, "."); 4472 } 4473 // Save current random number seed, so that we can recreate the random numbers 4474 // for this loop iteration in event of an error. 4475 seed = m_seed; 4476 4477 // Populate a test string with data. 4478 testText.truncate(0); 4479 for (i=0; i<TESTSTRINGLEN; i++) { 4480 int32_t aClassNum = m_rand() % numCharClasses; 4481 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4482 int32_t charIdx = m_rand() % classSet->size(); 4483 UChar32 c = classSet->charAt(charIdx); 4484 if (c < 0) { // TODO: deal with sets containing strings. 4485 errln("c < 0"); 4486 break; 4487 } 4488 testText.append(c); 4489 } 4490 4491 // Calculate the expected results for this test string. 4492 mk.setText(testText); 4493 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4494 expectedBreaks[0] = 1; 4495 int32_t breakPos = 0; 4496 expectedCount = 0; 4497 for (;;) { 4498 breakPos = mk.next(breakPos); 4499 if (breakPos == -1) { 4500 break; 4501 } 4502 if (breakPos > testText.length()) { 4503 errln("breakPos > testText.length()"); 4504 } 4505 expectedBreaks[breakPos] = 1; 4506 U_ASSERT(expectedCount<testText.length()); 4507 expected[expectedCount ++] = breakPos; 4508 } 4509 4510 // Find the break positions using forward iteration 4511 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4512 if (useUText) { 4513 UErrorCode status = U_ZERO_ERROR; 4514 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4515 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4516 bi->setText(testUText, status); 4517 TEST_ASSERT_SUCCESS(status); 4518 utext_close(testUText); // The break iterator does a shallow clone of the UText 4519 // This UText can be closed immediately, so long as the 4520 // testText string continues to exist. 
4521 } else { 4522 bi->setText(testText); 4523 } 4524 4525 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4526 if (i < 0 || i > testText.length()) { 4527 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4528 break; 4529 } 4530 forwardBreaks[i] = 1; 4531 } 4532 4533 // Find the break positions using reverse iteration 4534 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4535 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4536 if (i < 0 || i > testText.length()) { 4537 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4538 break; 4539 } 4540 reverseBreaks[i] = 1; 4541 } 4542 4543 // Find the break positions using isBoundary() tests. 4544 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4545 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4546 for (i=0; i<=testText.length(); i++) { 4547 isBoundaryBreaks[i] = bi->isBoundary(i); 4548 } 4549 4550 4551 // Find the break positions using the following() function. 4552 // printf("."); 4553 memset(followingBreaks, 0, sizeof(followingBreaks)); 4554 int32_t lastBreakPos = 0; 4555 followingBreaks[0] = 1; 4556 for (i=0; i<testText.length(); i++) { 4557 breakPos = bi->following(i); 4558 if (breakPos <= i || 4559 breakPos < lastBreakPos || 4560 breakPos > testText.length() || 4561 breakPos > lastBreakPos && lastBreakPos > i ) { 4562 errln("%s break monkey test: " 4563 "Out of range value returned by BreakIterator::following().\n" 4564 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4565 name, seed, i, breakPos, lastBreakPos); 4566 break; 4567 } 4568 followingBreaks[breakPos] = 1; 4569 lastBreakPos = breakPos; 4570 } 4571 4572 // Find the break positions using the preceding() function. 
4573 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4574 lastBreakPos = testText.length(); 4575 precedingBreaks[testText.length()] = 1; 4576 for (i=testText.length(); i>0; i--) { 4577 breakPos = bi->preceding(i); 4578 if (breakPos >= i || 4579 breakPos > lastBreakPos || 4580 breakPos < 0 && testText.getChar32Start(i)>0 || 4581 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) { 4582 errln("%s break monkey test: " 4583 "Out of range value returned by BreakIterator::preceding().\n" 4584 "index=%d; prev returned %d; lastBreak=%d" , 4585 name, i, breakPos, lastBreakPos); 4586 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4587 precedingBreaks[i] = 2; // Forces an error. 4588 } 4589 } else { 4590 if (breakPos >= 0) { 4591 precedingBreaks[breakPos] = 1; 4592 } 4593 lastBreakPos = breakPos; 4594 } 4595 } 4596 4597 // Compare the expected and actual results. 4598 for (i=0; i<=testText.length(); i++) { 4599 const char *errorType = NULL; 4600 if (forwardBreaks[i] != expectedBreaks[i]) { 4601 errorType = "next()"; 4602 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4603 errorType = "previous()"; 4604 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4605 errorType = "isBoundary()"; 4606 } else if (followingBreaks[i] != expectedBreaks[i]) { 4607 errorType = "following()"; 4608 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4609 errorType = "preceding()"; 4610 } 4611 4612 4613 if (errorType != NULL) { 4614 // Format a range of the test text that includes the failure as 4615 // a data item that can be included in the rbbi test data file. 4616 4617 // Start of the range is the last point where expected and actual results 4618 // both agreed that there was a break position. 
4619 int startContext = i; 4620 int32_t count = 0; 4621 for (;;) { 4622 if (startContext==0) { break; } 4623 startContext --; 4624 if (expectedBreaks[startContext] != 0) { 4625 if (count == 2) break; 4626 count ++; 4627 } 4628 } 4629 4630 // End of range is two expected breaks past the start position. 4631 int endContext = i + 1; 4632 int ci; 4633 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4634 for (;;) { 4635 if (endContext >= testText.length()) {break;} 4636 if (expectedBreaks[endContext-1] != 0) { 4637 if (count == 0) break; 4638 count --; 4639 } 4640 endContext ++; 4641 } 4642 } 4643 4644 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4645 UnicodeString errorText = "<data>"; 4646 /***if (strcmp(errorType, "next()") == 0) { 4647 startContext = 0; 4648 endContext = testText.length(); 4649 4650 printStringBreaks(testText, expected, expectedCount); 4651 }***/ 4652 4653 for (ci=startContext; ci<endContext;) { 4654 UnicodeString hexChars("0123456789abcdef"); 4655 UChar32 c; 4656 int bn; 4657 c = testText.char32At(ci); 4658 if (ci == i) { 4659 // This is the location of the error. 4660 errorText.append("<?>"); 4661 } else if (expectedBreaks[ci] != 0) { 4662 // This a non-error expected break position. 4663 errorText.append("\\"); 4664 } 4665 if (c < 0x10000) { 4666 errorText.append("\\u"); 4667 for (bn=12; bn>=0; bn-=4) { 4668 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4669 } 4670 } else { 4671 errorText.append("\\U"); 4672 for (bn=28; bn>=0; bn-=4) { 4673 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4674 } 4675 } 4676 ci = testText.moveIndex32(ci, 1); 4677 } 4678 errorText.append("\\"); 4679 errorText.append("</data>\n"); 4680 4681 // Output the error 4682 char charErrorTxt[500]; 4683 UErrorCode status = U_ZERO_ERROR; 4684 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4685 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4686 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4687 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4688 errorType, seed, i, charErrorTxt); 4689 break; 4690 } 4691 } 4692 4693 loopCount++; 4694 } 4695 #endif 4696 } 4697 4698 // 4699 // TestDebug - A place-holder test for debugging purposes. 4700 // For putting in fragments of other tests that can be invoked 4701 // for tracing without a lot of unwanted extra stuff happening. 4702 // 4703 void RBBITest::TestDebug(void) { 4704 #if 0 4705 UErrorCode status = U_ZERO_ERROR; 4706 int pos = 0; 4707 int ruleStatus = 0; 4708 4709 RuleBasedBreakIterator* bi = 4710 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4711 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4712 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4713 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4714 // UnicodeString s("Aaa. Bcd"); 4715 s = s.unescape(); 4716 bi->setText(s); 4717 UBool r = bi->isBoundary(8); 4718 printf("%s", r?"true":"false"); 4719 return; 4720 pos = bi->last(); 4721 do { 4722 // ruleStatus = bi->getRuleStatus(); 4723 printf("%d\t%d\n", pos, ruleStatus); 4724 pos = bi->previous(); 4725 } while (pos != BreakIterator::DONE); 4726 #endif 4727 } 4728 4729 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4730