1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include <typeinfo> // for 'typeid' to work 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/utypes.h" 19 #include "unicode/brkiter.h" 20 #include "unicode/rbbi.h" 21 #include "unicode/uchar.h" 22 #include "unicode/utf16.h" 23 #include "unicode/ucnv.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uniset.h" 26 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 27 #include "unicode/ustring.h" 28 #include "unicode/utext.h" 29 #include "intltest.h" 30 #include "rbbitst.h" 31 #include <string.h> 32 #include "uvector.h" 33 #include "uvectr32.h" 34 #include "triedict.h" 35 #include <string.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 39 #define TEST_ASSERT(x) {if (!(x)) { \ 40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 41 42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 44 45 46 //--------------------------------------------- 47 // runIndexedTest 48 //--------------------------------------------- 49 50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 51 { 52 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 53 54 switch (index) { 55 #if !UCONFIG_NO_FILE_IO 56 case 0: name = "TestBug4153072"; 57 if(exec) TestBug4153072(); break; 58 #else 59 case 0: 
name = "skip"; 60 break; 61 #endif 62 63 case 1: name = "TestJapaneseLineBreak"; 64 if(exec) TestJapaneseLineBreak(); break; 65 case 2: name = "TestStatusReturn"; 66 if(exec) TestStatusReturn(); break; 67 68 #if !UCONFIG_NO_FILE_IO 69 case 3: name = "TestUnicodeFiles"; 70 if(exec) TestUnicodeFiles(); break; 71 case 4: name = "TestEmptyString"; 72 if(exec) TestEmptyString(); break; 73 #else 74 case 3: case 4: name = "skip"; 75 break; 76 #endif 77 78 case 5: name = "TestGetAvailableLocales"; 79 if(exec) TestGetAvailableLocales(); break; 80 81 case 6: name = "TestGetDisplayName"; 82 if(exec) TestGetDisplayName(); break; 83 84 #if !UCONFIG_NO_FILE_IO 85 case 7: name = "TestEndBehaviour"; 86 if(exec) TestEndBehaviour(); break; 87 case 8: name = "TestMixedThaiLineBreak"; 88 if(exec) TestMixedThaiLineBreak(); break; 89 case 9: name = "TestThaiLineBreak"; 90 if(exec) TestThaiLineBreak(); break; 91 case 10: name = "TestMaiyamok"; 92 if(exec) TestMaiyamok(); break; 93 case 11: name = "TestWordBreaks"; 94 if(exec) TestWordBreaks(); break; 95 case 12: name = "TestWordBoundary"; 96 if(exec) TestWordBoundary(); break; 97 case 13: name = "TestLineBreaks"; 98 if(exec) TestLineBreaks(); break; 99 case 14: name = "TestSentBreaks"; 100 if(exec) TestSentBreaks(); break; 101 case 15: name = "TestExtended"; 102 if(exec) TestExtended(); break; 103 #else 104 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 105 break; 106 #endif 107 108 case 16: 109 if(exec) { 110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 111 name = "TestMonkey"; 112 TestMonkey(params); 113 #else 114 name = "skip"; 115 #endif 116 } 117 break; 118 119 #if !UCONFIG_NO_FILE_IO 120 case 17: name = "TestBug3818"; 121 if(exec) TestBug3818(); break; 122 case 18: name = "TestJapaneseWordBreak"; 123 if(exec) TestJapaneseWordBreak(); break; 124 #else 125 case 17: case 18: name = "skip"; 126 break; 127 #endif 128 129 case 19: name = "TestDebug"; 130 if(exec) TestDebug(); 
break; 131 case 20: name = "TestTrieDict"; 132 if(exec) TestTrieDict(); break; 133 134 #if !UCONFIG_NO_FILE_IO 135 case 21: name = "TestBug5775"; 136 if (exec) TestBug5775(); break; 137 case 22: name = "TestThaiBreaks"; 138 if (exec) TestThaiBreaks(); break; 139 case 23: name = "TestTailoredBreaks"; 140 if (exec) TestTailoredBreaks(); break; 141 #else 142 case 21: case 22: case 23: name = "skip"; 143 break; 144 #endif 145 case 24: name = "TestDictRules"; 146 if (exec) TestDictRules(); break; 147 case 25: name = "TestBug5532"; 148 if (exec) TestBug5532(); break; 149 default: name = ""; break; //needed to end loop 150 } 151 } 152 153 154 //--------------------------------------------------------------------------- 155 // 156 // class BITestData Holds a set of Break iterator test data and results 157 // Includes 158 // - the string data to be broken 159 // - a vector of the expected break positions. 160 // - a vector of source line numbers for the data, 161 // (to help see where errors occured.) 162 // - The expected break tag values. 163 // - Vectors of actual break positions and tag values. 164 // - Functions for comparing actual with expected and 165 // reporting errors. 166 // 167 //---------------------------------------------------------------------------- 168 class BITestData { 169 public: 170 UnicodeString fDataToBreak; 171 UVector fExpectedBreakPositions; 172 UVector fExpectedTags; 173 UVector fLineNum; 174 UVector fActualBreakPositions; // Test Results. 175 UVector fActualTags; 176 177 BITestData(UErrorCode &status); 178 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 179 void checkResults(const char *heading, RBBITest *test); 180 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 181 void clearResults(); 182 }; 183 184 // 185 // Constructor. 
186 // 187 BITestData::BITestData(UErrorCode &status) 188 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 189 fActualTags(status) 190 { 191 } 192 193 // 194 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 195 // The macro form collects the line number, which is helpful 196 // when tracking down failures. 197 // 198 // A null data item is inserted at the start of each test's data 199 // to put the starting zero into the data list. The position saved for 200 // each non-null item is its ending position. 201 // 202 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 203 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 204 if (U_FAILURE(status)) {return;} 205 if (data != NULL) { 206 fDataToBreak.append(CharsToUnicodeString(data)); 207 } 208 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 209 fExpectedTags.addElement(tag, status); 210 fLineNum.addElement(lineNum, status); 211 } 212 213 214 // 215 // checkResults. Compare the actual and expected break positions, report any differences. 216 // 217 void BITestData::checkResults(const char *heading, RBBITest *test) { 218 int32_t expectedIndex = 0; 219 int32_t actualIndex = 0; 220 221 for (;;) { 222 // If we've run through both the expected and actual results vectors, we're done. 223 // break out of the loop. 
224 if (expectedIndex >= fExpectedBreakPositions.size() && 225 actualIndex >= fActualBreakPositions.size()) { 226 break; 227 } 228 229 230 if (expectedIndex >= fExpectedBreakPositions.size()) { 231 err(heading, test, expectedIndex-1, actualIndex); 232 actualIndex++; 233 continue; 234 } 235 236 if (actualIndex >= fActualBreakPositions.size()) { 237 err(heading, test, expectedIndex, actualIndex-1); 238 expectedIndex++; 239 continue; 240 } 241 242 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 243 err(heading, test, expectedIndex, actualIndex); 244 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 245 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 246 actualIndex++; 247 } else { 248 expectedIndex++; 249 } 250 continue; 251 } 252 253 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 254 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 255 heading, fLineNum.elementAt(expectedIndex), 256 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 257 } 258 259 actualIndex++; 260 expectedIndex++; 261 } 262 } 263 264 // 265 // err - An error was found. Report it, along with information about where the 266 // incorrectly broken test data appeared in the source file. 267 // 268 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 269 { 270 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 271 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 272 int32_t o = 0; 273 int32_t line = fLineNum.elementAti(expectedIdx); 274 if (expectedIdx > 0) { 275 // The line numbers are off by one because a premature break occurs somewhere 276 // within the previous item, rather than at the start of the current (expected) item. 
277 // We want to report the offset of the unexpected break from the start of 278 // this previous item. 279 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 280 } 281 if (actual < expected) { 282 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 283 } else { 284 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 285 } 286 } 287 288 289 void BITestData::clearResults() { 290 fActualBreakPositions.removeAllElements(); 291 fActualTags.removeAllElements(); 292 } 293 294 295 //----------------------------------------------------------------------------------- 296 // 297 // Cannned Test Characters 298 // 299 //----------------------------------------------------------------------------------- 300 301 static const UChar cannedTestArray[] = { 302 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 303 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 304 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 305 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 306 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 307 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 308 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 309 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 310 }; 311 312 static UnicodeString* cannedTestChars = 0; 313 314 #define halfNA "\\u0928\\u094d\\u200d" 315 #define halfSA 
"\\u0938\\u094d\\u200d" 316 #define halfCHA "\\u091a\\u094d\\u200d" 317 #define halfKA "\\u0915\\u094d\\u200d" 318 #define deadTA "\\u0924\\u094d" 319 320 //-------------------------------------------------------------------------------------- 321 // 322 // RBBITest constructor and destructor 323 // 324 //-------------------------------------------------------------------------------------- 325 326 RBBITest::RBBITest() { 327 UnicodeString temp(cannedTestArray); 328 cannedTestChars = new UnicodeString(); 329 *cannedTestChars += (UChar)0x0000; 330 *cannedTestChars += temp; 331 } 332 333 334 RBBITest::~RBBITest() { 335 delete cannedTestChars; 336 } 337 338 339 static const int T_NUMBER = 100; 340 static const int T_LETTER = 200; 341 static const int T_H_OR_K = 300; 342 static const int T_IDEO = 400; 343 344 345 346 347 348 349 //-------------------------------------------------------------------- 350 //Testing the BreakIterator for devanagari script 351 //-------------------------------------------------------------------- 352 353 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 354 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 355 #define deadTTHA "\\u0920\\u094d" 356 #define deadPA "\\u092a\\u094d" 357 #define deadSA "\\u0938\\u094d" 358 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 359 360 361 362 363 364 365 //----------------------------------------------------------------------------------- 366 // 367 // Test for status {tag} return value from break rules. 368 // TODO: a more thorough test. 
369 // 370 //----------------------------------------------------------------------------------- 371 void RBBITest::TestStatusReturn() { 372 UnicodeString rulesString1("$Letters = [:L:];\n" 373 "$Numbers = [:N:];\n" 374 "$Letters+{1};\n" 375 "$Numbers+{2};\n" 376 "Help\\ {4}/me\\!;\n" 377 "[^$Letters $Numbers];\n" 378 "!.*;\n", -1, US_INV); 379 UnicodeString testString1 = "abc123..abc Help me Help me!"; 380 // 01234567890123456789012345678 381 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 382 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 383 384 UErrorCode status=U_ZERO_ERROR; 385 UParseError parseError; 386 387 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 388 if(U_FAILURE(status)) { 389 dataerrln("FAIL : in construction - %s", u_errorName(status)); 390 } else { 391 int32_t pos; 392 int32_t i = 0; 393 bi->setText(testString1); 394 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 395 if (pos != bounds1[i]) { 396 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 397 break; 398 } 399 400 int tag = bi->getRuleStatus(); 401 if (tag != brkStatus[i]) { 402 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 403 break; 404 } 405 i++; 406 } 407 } 408 delete bi; 409 } 410 411 412 static void printStringBreaks(UnicodeString ustr, int expected[], 413 int expectedcount) 414 { 415 UErrorCode status = U_ZERO_ERROR; 416 char name[100]; 417 printf("code alpha extend alphanum type word sent line name\n"); 418 int j; 419 for (j = 0; j < ustr.length(); j ++) { 420 if (expectedcount > 0) { 421 int k; 422 for (k = 0; k < expectedcount; k ++) { 423 if (j == expected[k]) { 424 printf("------------------------------------------------ %d\n", 425 j); 426 } 427 } 428 } 429 UChar32 c = ustr.char32At(j); 430 if (c > 0xffff) { 431 j ++; 432 } 433 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 434 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 435 u_isUAlphabetic(c), 436 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 437 u_isalnum(c), 438 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 439 u_charType(c), 440 U_SHORT_PROPERTY_NAME), 441 u_getPropertyValueName(UCHAR_WORD_BREAK, 442 u_getIntPropertyValue(c, 443 UCHAR_WORD_BREAK), 444 U_SHORT_PROPERTY_NAME), 445 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 446 u_getIntPropertyValue(c, 447 UCHAR_SENTENCE_BREAK), 448 U_SHORT_PROPERTY_NAME), 449 u_getPropertyValueName(UCHAR_LINE_BREAK, 450 u_getIntPropertyValue(c, 451 UCHAR_LINE_BREAK), 452 U_SHORT_PROPERTY_NAME), 453 name); 454 } 455 } 456 457 void RBBITest::TestThaiLineBreak() { 458 UErrorCode status = U_ZERO_ERROR; 459 BITestData thaiLineSelection(status); 460 461 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 462 // represents elided letters at the end of a long word. It should be bound to 463 // the end of the word and not treated as an independent punctuation mark. 464 465 466 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 467 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 468 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 469 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 470 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 471 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 472 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 473 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 474 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 475 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 476 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 477 ADD_DATACHUNK(thaiLineSelection, 
"\\u0e40\\u0e23\\u0e48\\u0e07", 0, status); 478 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 479 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 480 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 481 482 // the one time where the paiyannoi occurs somewhere other than at the end 483 // of a word is in the Thai abbrevation for "etc.", which both begins and 484 // ends with a paiyannoi 485 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 486 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 487 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 488 489 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 490 Locale("th"), status); 491 if (U_FAILURE(status)) 492 { 493 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 494 return; 495 } 496 497 generalIteratorTest(*e, thaiLineSelection); 498 delete e; 499 } 500 501 502 503 void RBBITest::TestMixedThaiLineBreak() 504 { 505 UErrorCode status = U_ZERO_ERROR; 506 BITestData thaiLineSelection(status); 507 508 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 509 510 511 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 512 // start 513 514 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 515 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 516 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 517 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 518 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 519 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 520 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 521 
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status); 522 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 523 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 524 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 525 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 526 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 527 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 528 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 529 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 530 531 // @suwit - end of changes 532 533 534 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 535 if (U_FAILURE(status)) 536 { 537 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 538 return; 539 } 540 541 542 generalIteratorTest(*e, thaiLineSelection); 543 delete e; 544 } 545 546 547 void RBBITest::TestMaiyamok() 548 { 549 UErrorCode status = U_ZERO_ERROR; 550 BITestData thaiLineSelection(status); 551 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 552 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 553 // word". 
Instead of appearing as a word unto itself, however, it's kept together 554 // with the word before it 555 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 556 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 557 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 558 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 559 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 560 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 561 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 562 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 563 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 564 565 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 566 Locale("th"), status); 567 568 if (U_FAILURE(status)) 569 { 570 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 571 return; 572 } 573 generalIteratorTest(*e, thaiLineSelection); 574 delete e; 575 } 576 577 578 579 void RBBITest::TestBug3818() { 580 UErrorCode status = U_ZERO_ERROR; 581 582 // Four Thai words... 
583 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 584 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 585 UnicodeString thaiStr(thaiWordData); 586 587 RuleBasedBreakIterator* bi = 588 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 589 if (U_FAILURE(status) || bi == NULL) { 590 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 591 return; 592 } 593 bi->setText(thaiStr); 594 595 int32_t startOfSecondWord = bi->following(1); 596 if (startOfSecondWord != 4) { 597 errln("Fail at file %s, line %d expected start of word at 4, got %d", 598 __FILE__, __LINE__, startOfSecondWord); 599 } 600 startOfSecondWord = bi->following(0); 601 if (startOfSecondWord != 4) { 602 errln("Fail at file %s, line %d expected start of word at 4, got %d", 603 __FILE__, __LINE__, startOfSecondWord); 604 } 605 delete bi; 606 } 607 608 609 void RBBITest::TestJapaneseWordBreak() { 610 UErrorCode status = U_ZERO_ERROR; 611 BITestData japaneseWordSelection(status); 612 613 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 614 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 615 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 616 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 617 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 618 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 619 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 620 621 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 622 Locale("ja"), status); 623 if (U_FAILURE(status)) 624 { 625 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 626 return; 627 } 628 629 generalIteratorTest(*e, japaneseWordSelection); 630 
delete e; 631 } 632 633 void RBBITest::TestTrieDict() { 634 UErrorCode status = U_ZERO_ERROR; 635 636 // 637 // Open and read the test data file. 638 // 639 const char *testDataDirectory = IntlTest::getSourceTestData(status); 640 char testFileName[1000]; 641 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 642 errln("Can't open test data. Path too long."); 643 return; 644 } 645 strcpy(testFileName, testDataDirectory); 646 strcat(testFileName, "riwords.txt"); 647 648 // Items needing deleting at the end 649 MutableTrieDictionary *mutableDict = NULL; 650 CompactTrieDictionary *compactDict = NULL; 651 UnicodeSet *breaks = NULL; 652 UChar *testFile = NULL; 653 StringEnumeration *enumer1 = NULL; 654 StringEnumeration *enumer2 = NULL; 655 MutableTrieDictionary *mutable2 = NULL; 656 StringEnumeration *cloneEnum = NULL; 657 CompactTrieDictionary *compact2 = NULL; 658 659 660 const UnicodeString *originalWord = NULL; 661 const UnicodeString *cloneWord = NULL; 662 UChar *current; 663 UChar *word; 664 UChar uc; 665 int32_t wordLen; 666 int32_t wordCount; 667 int32_t testCount; 668 669 int len; 670 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 671 if (U_FAILURE(status)) { 672 goto cleanup; /* something went wrong, error already output */ 673 } 674 675 mutableDict = new MutableTrieDictionary(0x0E1C, status); 676 if (U_FAILURE(status)) { 677 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 678 goto cleanup; 679 } 680 681 breaks = new UnicodeSet; 682 breaks->add(0x000A); // Line Feed 683 breaks->add(0x000D); // Carriage Return 684 breaks->add(0x2028); // Line Separator 685 breaks->add(0x2029); // Paragraph Separator 686 687 // Now add each non-comment line of the file as a word. 
688 current = testFile; 689 word = current; 690 uc = *current++; 691 wordLen = 0; 692 wordCount = 0; 693 694 while (uc) { 695 if (uc == 0x0023) { // #comment line, skip 696 while (uc && !breaks->contains(uc)) { 697 uc = *current++; 698 } 699 } 700 else while (uc && !breaks->contains(uc)) { 701 ++wordLen; 702 uc = *current++; 703 } 704 if (wordLen > 0) { 705 mutableDict->addWord(word, wordLen, status); 706 if (U_FAILURE(status)) { 707 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 708 goto cleanup; 709 } 710 wordCount += 1; 711 } 712 713 // Find beginning of next line 714 while (uc && breaks->contains(uc)) { 715 uc = *current++; 716 } 717 word = current-1; 718 wordLen = 0; 719 } 720 721 if (wordCount < 50) { 722 errln("Word count (%d) unreasonably small\n", wordCount); 723 goto cleanup; 724 } 725 726 enumer1 = mutableDict->openWords(status); 727 if (U_FAILURE(status)) { 728 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 729 goto cleanup; 730 } 731 732 testCount = 0; 733 if (wordCount != (testCount = enumer1->count(status))) { 734 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 735 testCount, wordCount, u_errorName(status)); 736 goto cleanup; 737 } 738 739 // Now compact it 740 compactDict = new CompactTrieDictionary(*mutableDict, status); 741 if (U_FAILURE(status)) { 742 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 743 goto cleanup; 744 } 745 746 enumer2 = compactDict->openWords(status); 747 if (U_FAILURE(status)) { 748 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 749 goto cleanup; 750 } 751 752 if (wordCount != (testCount = enumer2->count(status))) { 753 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 754 testCount, wordCount, u_errorName(status)); 755 goto cleanup; 756 } 757 758 if (typeid(*enumer1) == typeid(*enumer2)) 
{ 759 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same"); 760 } 761 delete enumer1; 762 enumer1 = NULL; 763 delete enumer2; 764 enumer2 = NULL; 765 766 // Now un-compact it 767 mutable2 = compactDict->cloneMutable(status); 768 if (U_FAILURE(status)) { 769 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 770 goto cleanup; 771 } 772 773 cloneEnum = mutable2->openWords(status); 774 if (U_FAILURE(status)) { 775 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 776 goto cleanup; 777 } 778 779 if (wordCount != (testCount = cloneEnum->count(status))) { 780 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 781 testCount, wordCount, u_errorName(status)); 782 goto cleanup; 783 } 784 785 // Compact original dictionary to clone. Note that we can only compare the same kind of 786 // dictionary as the order of the enumerators is not guaranteed to be the same between 787 // different kinds 788 enumer1 = mutableDict->openWords(status); 789 if (U_FAILURE(status)) { 790 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 791 goto cleanup; 792 } 793 794 originalWord = enumer1->snext(status); 795 cloneWord = cloneEnum->snext(status); 796 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 797 if (*originalWord != *cloneWord) { 798 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 799 goto cleanup; 800 } 801 originalWord = enumer1->snext(status); 802 cloneWord = cloneEnum->snext(status); 803 } 804 805 if (U_FAILURE(status)) { 806 errln("Enumeration failed: %s\n", u_errorName(status)); 807 goto cleanup; 808 } 809 810 if (originalWord != cloneWord) { 811 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 812 goto cleanup; 813 } 814 815 // Test the data copying constructor for CompactTrieDict, and the data access 
APIs. 816 compact2 = new CompactTrieDictionary(compactDict->data(), status); 817 if (U_FAILURE(status)) { 818 errln("CompactTrieDictionary(const void *,...) failed\n"); 819 goto cleanup; 820 } 821 822 if (compact2->dataSize() == 0) { 823 errln("CompactTrieDictionary->dataSize() == 0\n"); 824 goto cleanup; 825 } 826 827 // Now count the words via the second dictionary 828 delete enumer1; 829 enumer1 = compact2->openWords(status); 830 if (U_FAILURE(status)) { 831 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 832 goto cleanup; 833 } 834 835 if (wordCount != (testCount = enumer1->count(status))) { 836 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 837 testCount, wordCount, u_errorName(status)); 838 goto cleanup; 839 } 840 841 cleanup: 842 delete compactDict; 843 delete mutableDict; 844 delete breaks; 845 delete[] testFile; 846 delete enumer1; 847 delete mutable2; 848 delete cloneEnum; 849 delete compact2; 850 } 851 852 853 //---------------------------------------------------------------------------- 854 // 855 // generalIteratorTest Given a break iterator and a set of test data, 856 // Run the tests and report the results. 857 // 858 //---------------------------------------------------------------------------- 859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 860 { 861 862 bi.setText(td.fDataToBreak); 863 864 testFirstAndNext(bi, td); 865 866 testLastAndPrevious(bi, td); 867 868 testFollowing(bi, td); 869 testPreceding(bi, td); 870 testIsBoundary(bi, td); 871 doMultipleSelectionTest(bi, td); 872 } 873 874 875 // 876 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 877 // kind of loop. 
878 // 879 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 880 { 881 UErrorCode status = U_ZERO_ERROR; 882 int32_t p; 883 int32_t lastP = -1; 884 int32_t tag; 885 886 logln("Test first and next"); 887 bi.setText(td.fDataToBreak); 888 td.clearResults(); 889 890 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 891 td.fActualBreakPositions.addElement(p, status); // Save result. 892 tag = bi.getRuleStatus(); 893 td.fActualTags.addElement(tag, status); 894 if (p <= lastP) { 895 // If the iterator is not making forward progress, stop. 896 // No need to raise an error here, it'll be detected in the normal check of results. 897 break; 898 } 899 lastP = p; 900 } 901 td.checkResults("testFirstAndNext", this); 902 } 903 904 905 // 906 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 907 // 908 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 909 { 910 UErrorCode status = U_ZERO_ERROR; 911 int32_t p; 912 int32_t lastP = 0x7ffffffe; 913 int32_t tag; 914 915 logln("Test last and previous"); 916 bi.setText(td.fDataToBreak); 917 td.clearResults(); 918 919 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 920 // Save break position. Insert it at start of vector of results, shoving 921 // already-saved results further towards the end. 922 td.fActualBreakPositions.insertElementAt(p, 0, status); 923 // bi.previous(); // TODO: Why does this fix things up???? 924 // bi.next(); 925 tag = bi.getRuleStatus(); 926 td.fActualTags.insertElementAt(tag, 0, status); 927 if (p >= lastP) { 928 // If the iterator is not making progress, stop. 929 // No need to raise an error here, it'll be detected in the normal check of results. 
930 break; 931 } 932 lastP = p; 933 } 934 td.checkResults("testLastAndPrevious", this); 935 } 936 937 938 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 939 { 940 UErrorCode status = U_ZERO_ERROR; 941 int32_t p; 942 int32_t tag; 943 int32_t lastP = -2; // A value that will never be returned as a break position. 944 // cannot be -1; that is returned for DONE. 945 int i; 946 947 logln("testFollowing():"); 948 bi.setText(td.fDataToBreak); 949 td.clearResults(); 950 951 // Save the starting point, since we won't get that out of following. 952 p = bi.first(); 953 td.fActualBreakPositions.addElement(p, status); // Save result. 954 tag = bi.getRuleStatus(); 955 td.fActualTags.addElement(tag, status); 956 957 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 958 p = bi.following(i); 959 if (p != lastP) { 960 if (p == RuleBasedBreakIterator::DONE) { 961 break; 962 } 963 // We've reached a new break position. Save it. 964 td.fActualBreakPositions.addElement(p, status); // Save result. 965 tag = bi.getRuleStatus(); 966 td.fActualTags.addElement(tag, status); 967 lastP = p; 968 } 969 } 970 // The loop normally exits by means of the break in the middle. 971 // Make sure that the index was at the correct position for the break iterator to have 972 // returned DONE. 973 if (i != td.fDataToBreak.length()) { 974 errln("testFollowing(): iterator returned DONE prematurely."); 975 } 976 977 // Full check of all results. 
978 td.checkResults("testFollowing", this); 979 } 980 981 982 983 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 984 UErrorCode status = U_ZERO_ERROR; 985 int32_t p; 986 int32_t tag; 987 int32_t lastP = 0x7ffffffe; 988 int i; 989 990 logln("testPreceding():"); 991 bi.setText(td.fDataToBreak); 992 td.clearResults(); 993 994 p = bi.last(); 995 td.fActualBreakPositions.addElement(p, status); 996 tag = bi.getRuleStatus(); 997 td.fActualTags.addElement(tag, status); 998 999 for (i = td.fDataToBreak.length(); i>=-1; i--) { 1000 p = bi.preceding(i); 1001 if (p != lastP) { 1002 if (p == RuleBasedBreakIterator::DONE) { 1003 break; 1004 } 1005 // We've reached a new break position. Save it. 1006 td.fActualBreakPositions.insertElementAt(p, 0, status); 1007 lastP = p; 1008 tag = bi.getRuleStatus(); 1009 td.fActualTags.insertElementAt(tag, 0, status); 1010 } 1011 } 1012 // The loop normally exits by means of the break in the middle. 1013 // Make sure that the index was at the correct position for the break iterator to have 1014 // returned DONE. 1015 if (i != 0) { 1016 errln("testPreceding(): iterator returned DONE prematurely."); 1017 } 1018 1019 // Full check of all results. 1020 td.checkResults("testPreceding", this); 1021 } 1022 1023 1024 1025 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 1026 UErrorCode status = U_ZERO_ERROR; 1027 int i; 1028 int32_t tag; 1029 1030 logln("testIsBoundary():"); 1031 bi.setText(td.fDataToBreak); 1032 td.clearResults(); 1033 1034 for (i = 0; i <= td.fDataToBreak.length(); i++) { 1035 if (bi.isBoundary(i)) { 1036 td.fActualBreakPositions.addElement(i, status); // Save result. 
1037 tag = bi.getRuleStatus(); 1038 td.fActualTags.addElement(tag, status); 1039 } 1040 } 1041 td.checkResults("testIsBoundary: ", this); 1042 } 1043 1044 1045 1046 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1047 { 1048 iterator.setText(td.fDataToBreak); 1049 1050 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1051 int32_t offset = iterator.first(); 1052 int32_t testOffset; 1053 int32_t count = 0; 1054 1055 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1056 1057 if (*testIterator != iterator) 1058 errln("clone() or operator!= failed: two clones compared unequal"); 1059 1060 do { 1061 testOffset = testIterator->first(); 1062 testOffset = testIterator->next(count); 1063 if (offset != testOffset) 1064 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1065 1066 if (offset != RuleBasedBreakIterator::DONE) { 1067 count++; 1068 offset = iterator.next(); 1069 1070 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1071 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1072 if (count > 10000 || offset == -1) { 1073 errln("operator== failed too many times. Stopping test."); 1074 if (offset == -1) { 1075 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1076 } 1077 return; 1078 } 1079 } 1080 } 1081 } while (offset != RuleBasedBreakIterator::DONE); 1082 1083 // now do it backwards... 
1084 offset = iterator.last(); 1085 count = 0; 1086 1087 do { 1088 testOffset = testIterator->last(); 1089 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1090 if (offset != testOffset) 1091 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1092 1093 if (offset != RuleBasedBreakIterator::DONE) { 1094 count--; 1095 offset = iterator.previous(); 1096 } 1097 } while (offset != RuleBasedBreakIterator::DONE); 1098 1099 delete testIterator; 1100 } 1101 1102 1103 //--------------------------------------------- 1104 // 1105 // other tests 1106 // 1107 //--------------------------------------------- 1108 void RBBITest::TestEmptyString() 1109 { 1110 UnicodeString text = ""; 1111 UErrorCode status = U_ZERO_ERROR; 1112 1113 BITestData x(status); 1114 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1115 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1116 if (U_FAILURE(status)) 1117 { 1118 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1119 return; 1120 } 1121 generalIteratorTest(*bi, x); 1122 delete bi; 1123 } 1124 1125 void RBBITest::TestGetAvailableLocales() 1126 { 1127 int32_t locCount = 0; 1128 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1129 1130 if (locCount == 0) 1131 dataerrln("getAvailableLocales() returned an empty list!"); 1132 // Just make sure that it's returning good memory. 
1133 int32_t i; 1134 for (i = 0; i < locCount; ++i) { 1135 logln(locList[i].getName()); 1136 } 1137 } 1138 1139 //Testing the BreakIterator::getDisplayName() function 1140 void RBBITest::TestGetDisplayName() 1141 { 1142 UnicodeString result; 1143 1144 BreakIterator::getDisplayName(Locale::getUS(), result); 1145 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1146 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1147 + result); 1148 1149 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1150 if (result != "French (France)") 1151 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1152 + result); 1153 } 1154 /** 1155 * Test End Behaviour 1156 * @bug 4068137 1157 */ 1158 void RBBITest::TestEndBehaviour() 1159 { 1160 UErrorCode status = U_ZERO_ERROR; 1161 UnicodeString testString("boo."); 1162 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1163 if (U_FAILURE(status)) 1164 { 1165 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1166 return; 1167 } 1168 wb->setText(testString); 1169 1170 if (wb->first() != 0) 1171 errln("Didn't get break at beginning of string."); 1172 if (wb->next() != 3) 1173 errln("Didn't get break before period in \"boo.\""); 1174 if (wb->current() != 4 && wb->next() != 4) 1175 errln("Didn't get break at end of string."); 1176 delete wb; 1177 } 1178 /* 1179 * @bug 4153072 1180 */ 1181 void RBBITest::TestBug4153072() { 1182 UErrorCode status = U_ZERO_ERROR; 1183 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1184 if (U_FAILURE(status)) 1185 { 1186 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1187 return; 1188 } 1189 UnicodeString str("...Hello, World!..."); 1190 int32_t begin = 3; 1191 int32_t end = str.length() - 3; 1192 UBool onBoundary; 1193 1194 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1195 iter->adoptText(textIterator); 1196 int index; 1197 // Note: with the switch to UText, there is no way to restrict the 1198 // iteration range to begin at an index other than zero. 1199 // String character iterators created with a non-zero bound are 1200 // treated by RBBI as being empty. 1201 for (index = -1; index < begin + 1; ++index) { 1202 onBoundary = iter->isBoundary(index); 1203 if (index == 0? !onBoundary : onBoundary) { 1204 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1205 " and begin index = " + begin); 1206 } 1207 } 1208 delete iter; 1209 } 1210 1211 1212 // 1213 // Test for problem reported by Ashok Matoria on 9 July 2007 1214 // One.<kSoftHyphen><kSpace>Two. 1215 // 1216 // Sentence break at start (0) and then on calling next() it breaks at 1217 // 'T' of "Two". Now, at this point if I do next() and 1218 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1219 // 1220 void RBBITest::TestBug5775() { 1221 UErrorCode status = U_ZERO_ERROR; 1222 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1223 TEST_ASSERT_SUCCESS(status); 1224 if (U_FAILURE(status)) { 1225 return; 1226 } 1227 // Check for status first for better handling of no data errors. 1228 TEST_ASSERT(bi != NULL); 1229 if (bi == NULL) { 1230 return; 1231 } 1232 1233 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1234 // 01234 56789 1235 s = s.unescape(); 1236 bi->setText(s); 1237 int pos = bi->next(); 1238 TEST_ASSERT(pos == 6); 1239 pos = bi->next(); 1240 TEST_ASSERT(pos == 10); 1241 pos = bi->previous(); 1242 TEST_ASSERT(pos == 6); 1243 delete bi; 1244 } 1245 1246 1247 1248 /** 1249 * Test Japanese Line Break 1250 * @bug 4095322 1251 */ 1252 void RBBITest::TestJapaneseLineBreak() 1253 { 1254 #if 0 1255 // Test needs updating some more... Dump it for now. 1256 1257 1258 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1259 // as opening and closing punctuation for line breaking. 1260 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1261 // from these tests. 
6-13-2002 1262 // 1263 UErrorCode status = U_ZERO_ERROR; 1264 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1265 UnicodeString precedingChars = CharsToUnicodeString( 1266 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1267 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1268 UnicodeString followingChars = CharsToUnicodeString( 1269 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1270 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1271 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1272 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1273 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1274 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1275 1276 int32_t i; 1277 if (U_FAILURE(status)) 1278 { 1279 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1280 return; 1281 } 1282 1283 for (i = 0; i < precedingChars.length(); i++) { 1284 testString.setCharAt(1, precedingChars[i]); 1285 iter->setText(testString); 1286 int32_t j = iter->first(); 1287 if (j != 0) 1288 errln("ja line break failure: failed to start at 0"); 1289 j = iter->next(); 1290 if (j != 1) 1291 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1292 + "' (" + ((int)(precedingChars[i])) + ")"); 1293 j = iter->next(); 1294 if (j != 3) 1295 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1296 + "' (" + ((int)(precedingChars[i])) + ")"); 1297 } 1298 1299 for (i = 0; i < followingChars.length(); i++) { 1300 testString.setCharAt(1, followingChars[i]); 1301 iter->setText(testString); 1302 int j = iter->first(); 1303 if (j != 0) 1304 errln("ja line break failure: failed to start at 0"); 1305 j = iter->next(); 1306 
if (j != 2) 1307 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1308 + "' (" + ((int)(followingChars[i])) + ")"); 1309 j = iter->next(); 1310 if (j != 3) 1311 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1312 + "' (" + ((int)(followingChars[i])) + ")"); 1313 } 1314 delete iter; 1315 #endif 1316 } 1317 1318 1319 //------------------------------------------------------------------------------ 1320 // 1321 // RBBITest::Extended Run RBBI Tests from an external test data file 1322 // 1323 //------------------------------------------------------------------------------ 1324 1325 struct TestParams { 1326 BreakIterator *bi; 1327 UnicodeString dataToBreak; 1328 UVector32 *expectedBreaks; 1329 UVector32 *srcLine; 1330 UVector32 *srcCol; 1331 }; 1332 1333 void RBBITest::executeTest(TestParams *t) { 1334 int32_t bp; 1335 int32_t prevBP; 1336 int32_t i; 1337 1338 if (t->bi == NULL) { 1339 return; 1340 } 1341 1342 t->bi->setText(t->dataToBreak); 1343 // 1344 // Run the iterator forward 1345 // 1346 prevBP = -1; 1347 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1348 if (prevBP == bp) { 1349 // Fail for lack of forward progress. 1350 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1351 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1352 break; 1353 } 1354 1355 // Check that there were we didn't miss an expected break between the last one 1356 // and this one. 1357 for (i=prevBP+1; i<bp; i++) { 1358 if (t->expectedBreaks->elementAti(i) != 0) { 1359 int expected[] = {0, i}; 1360 printStringBreaks(t->dataToBreak, expected, 2); 1361 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1362 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1363 } 1364 } 1365 1366 // Check that the break we did find was expected 1367 if (t->expectedBreaks->elementAti(bp) == 0) { 1368 int expected[] = {0, bp}; 1369 printStringBreaks(t->dataToBreak, expected, 2); 1370 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1371 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1372 } else { 1373 // The break was expected. 1374 // Check that the {nnn} tag value is correct. 1375 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1376 if (expectedTagVal == -1) { 1377 expectedTagVal = 0; 1378 } 1379 int32_t line = t->srcLine->elementAti(bp); 1380 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1381 if (rs != expectedTagVal) { 1382 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1383 " Actual, Expected status = %4d, %4d", 1384 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1385 } 1386 } 1387 1388 1389 prevBP = bp; 1390 } 1391 1392 // Verify that there were no missed expected breaks after the last one found 1393 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1394 if (t->expectedBreaks->elementAti(i) != 0) { 1395 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1396 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1397 } 1398 } 1399 1400 // 1401 // Run the iterator backwards, verify that the same breaks are found. 1402 // 1403 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1404 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1405 if (prevBP == bp) { 1406 // Fail for lack of progress. 1407 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1408 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1409 break; 1410 } 1411 1412 // Check that there were we didn't miss an expected break between the last one 1413 // and this one. (UVector returns zeros for index out of bounds.) 1414 for (i=prevBP-1; i>bp; i--) { 1415 if (t->expectedBreaks->elementAti(i) != 0) { 1416 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1417 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1418 } 1419 } 1420 1421 // Check that the break we did find was expected 1422 if (t->expectedBreaks->elementAti(bp) == 0) { 1423 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1424 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1425 } else { 1426 // The break was expected. 1427 // Check that the {nnn} tag value is correct. 1428 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1429 if (expectedTagVal == -1) { 1430 expectedTagVal = 0; 1431 } 1432 int line = t->srcLine->elementAti(bp); 1433 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1434 if (rs != expectedTagVal) { 1435 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1436 " Actual, Expected status = %4d, %4d", 1437 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1438 } 1439 } 1440 1441 prevBP = bp; 1442 } 1443 1444 // Verify that there were no missed breaks prior to the last one found 1445 for (i=prevBP-1; i>=0; i--) { 1446 if (t->expectedBreaks->elementAti(i) != 0) { 1447 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1448 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1449 } 1450 } 1451 } 1452 1453 1454 void RBBITest::TestExtended() { 1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1456 UErrorCode status = U_ZERO_ERROR; 1457 Locale locale(""); 1458 1459 UnicodeString rules; 1460 TestParams tp; 1461 tp.bi = NULL; 1462 tp.expectedBreaks = new UVector32(status); 1463 tp.srcLine = new UVector32(status); 1464 tp.srcCol = new UVector32(status); 1465 1466 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1467 if (U_FAILURE(status)) { 1468 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1469 } 1470 1471 1472 // 1473 // Open and read the test data file. 1474 // 1475 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1476 char testFileName[1000]; 1477 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1478 errln("Can't open test data. 
Path too long."); 1479 return; 1480 } 1481 strcpy(testFileName, testDataDirectory); 1482 strcat(testFileName, "rbbitst.txt"); 1483 1484 int len; 1485 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1486 if (U_FAILURE(status)) { 1487 return; /* something went wrong, error already output */ 1488 } 1489 1490 1491 1492 1493 // 1494 // Put the test data into a UnicodeString 1495 // 1496 UnicodeString testString(FALSE, testFile, len); 1497 1498 enum EParseState{ 1499 PARSE_COMMENT, 1500 PARSE_TAG, 1501 PARSE_DATA, 1502 PARSE_NUM 1503 } 1504 parseState = PARSE_TAG; 1505 1506 EParseState savedState = PARSE_TAG; 1507 1508 static const UChar CH_LF = 0x0a; 1509 static const UChar CH_CR = 0x0d; 1510 static const UChar CH_HASH = 0x23; 1511 /*static const UChar CH_PERIOD = 0x2e;*/ 1512 static const UChar CH_LT = 0x3c; 1513 static const UChar CH_GT = 0x3e; 1514 static const UChar CH_BACKSLASH = 0x5c; 1515 static const UChar CH_BULLET = 0x2022; 1516 1517 int32_t lineNum = 1; 1518 int32_t colStart = 0; 1519 int32_t column = 0; 1520 int32_t charIdx = 0; 1521 1522 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1523 1524 for (charIdx = 0; charIdx < len; ) { 1525 status = U_ZERO_ERROR; 1526 UChar c = testString.charAt(charIdx); 1527 charIdx++; 1528 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1529 // treat CRLF as a unit 1530 c = CH_LF; 1531 charIdx++; 1532 } 1533 if (c == CH_LF || c == CH_CR) { 1534 lineNum++; 1535 colStart = charIdx; 1536 } 1537 column = charIdx - colStart + 1; 1538 1539 switch (parseState) { 1540 case PARSE_COMMENT: 1541 if (c == 0x0a || c == 0x0d) { 1542 parseState = savedState; 1543 } 1544 break; 1545 1546 case PARSE_TAG: 1547 { 1548 if (c == CH_HASH) { 1549 parseState = PARSE_COMMENT; 1550 savedState = PARSE_TAG; 1551 break; 1552 } 1553 if (u_isUWhiteSpace(c)) { 1554 break; 1555 } 1556 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1557 delete tp.bi; 1558 tp.bi = BreakIterator::createWordInstance(locale, status); 1559 charIdx += 5; 1560 break; 1561 } 1562 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1563 delete tp.bi; 1564 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1565 charIdx += 5; 1566 break; 1567 } 1568 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1569 delete tp.bi; 1570 tp.bi = BreakIterator::createLineInstance(locale, status); 1571 charIdx += 5; 1572 break; 1573 } 1574 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1575 delete tp.bi; 1576 tp.bi = NULL; 1577 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1578 charIdx += 5; 1579 break; 1580 } 1581 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1582 delete tp.bi; 1583 tp.bi = BreakIterator::createTitleInstance(locale, status); 1584 charIdx += 6; 1585 break; 1586 } 1587 1588 // <locale loc_name> 1589 localeMatcher.reset(testString); 1590 if (localeMatcher.lookingAt(charIdx-1, status)) { 1591 UnicodeString localeName = localeMatcher.group(1, status); 1592 char localeName8[100]; 1593 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1594 locale = 
Locale::createFromName(localeName8); 1595 charIdx += localeMatcher.group(0, status).length(); 1596 TEST_ASSERT_SUCCESS(status); 1597 break; 1598 } 1599 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1600 parseState = PARSE_DATA; 1601 charIdx += 5; 1602 tp.dataToBreak = ""; 1603 tp.expectedBreaks->removeAllElements(); 1604 tp.srcCol ->removeAllElements(); 1605 tp.srcLine->removeAllElements(); 1606 break; 1607 } 1608 1609 errln("line %d: Tag expected in test file.", lineNum); 1610 parseState = PARSE_COMMENT; 1611 savedState = PARSE_DATA; 1612 goto end_test; // Stop the test. 1613 } 1614 break; 1615 1616 case PARSE_DATA: 1617 if (c == CH_BULLET) { 1618 int32_t breakIdx = tp.dataToBreak.length(); 1619 tp.expectedBreaks->setSize(breakIdx+1); 1620 tp.expectedBreaks->setElementAt(-1, breakIdx); 1621 tp.srcLine->setSize(breakIdx+1); 1622 tp.srcLine->setElementAt(lineNum, breakIdx); 1623 tp.srcCol ->setSize(breakIdx+1); 1624 tp.srcCol ->setElementAt(column, breakIdx); 1625 break; 1626 } 1627 1628 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1629 // Add final entry to mappings from break location to source file position. 1630 // Need one extra because last break position returned is after the 1631 // last char in the data, not at the last char. 1632 tp.srcLine->addElement(lineNum, status); 1633 tp.srcCol ->addElement(column, status); 1634 1635 parseState = PARSE_TAG; 1636 charIdx += 6; 1637 1638 // RUN THE TEST! 1639 executeTest(&tp); 1640 break; 1641 } 1642 1643 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1644 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1645 // Get the code point from the name and insert it into the test data. 1646 // (Damn, no API takes names in Unicode !!! 
1647 // we've got to take it back to char *) 1648 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1649 int32_t nameLength = nameEndIdx - (charIdx+2); 1650 char charNameBuf[200]; 1651 UChar32 theChar = -1; 1652 if (nameEndIdx != -1) { 1653 UErrorCode status = U_ZERO_ERROR; 1654 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1655 charNameBuf[sizeof(charNameBuf)-1] = 0; 1656 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1657 if (U_FAILURE(status)) { 1658 theChar = -1; 1659 } 1660 } 1661 if (theChar == -1) { 1662 errln("Error in named character in test file at line %d, col %d", 1663 lineNum, column); 1664 } else { 1665 // Named code point was recognized. Insert it 1666 // into the test data. 1667 tp.dataToBreak.append(theChar); 1668 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1669 tp.srcLine->addElement(lineNum, status); 1670 tp.srcCol ->addElement(column, status); 1671 } 1672 } 1673 if (nameEndIdx > charIdx) { 1674 charIdx = nameEndIdx+1; 1675 1676 } 1677 break; 1678 } 1679 1680 1681 1682 1683 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1684 charIdx++; 1685 int32_t breakIdx = tp.dataToBreak.length(); 1686 tp.expectedBreaks->setSize(breakIdx+1); 1687 tp.expectedBreaks->setElementAt(-1, breakIdx); 1688 tp.srcLine->setSize(breakIdx+1); 1689 tp.srcLine->setElementAt(lineNum, breakIdx); 1690 tp.srcCol ->setSize(breakIdx+1); 1691 tp.srcCol ->setElementAt(column, breakIdx); 1692 break; 1693 } 1694 1695 if (c == CH_LT) { 1696 tagValue = 0; 1697 parseState = PARSE_NUM; 1698 break; 1699 } 1700 1701 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1702 parseState = PARSE_COMMENT; 1703 savedState = PARSE_DATA; 1704 break; 1705 } 1706 1707 if (c == CH_BACKSLASH) { 1708 // Check for \ at end of line, a line continuation. 
1709 // Advance over (discard) the newline 1710 UChar32 cp = testString.char32At(charIdx); 1711 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1712 // We have a CR LF 1713 // Need an extra increment of the input ptr to move over both of them 1714 charIdx++; 1715 } 1716 if (cp == CH_LF || cp == CH_CR) { 1717 lineNum++; 1718 colStart = charIdx; 1719 charIdx++; 1720 break; 1721 } 1722 1723 // Let unescape handle the back slash. 1724 cp = testString.unescapeAt(charIdx); 1725 if (cp != -1) { 1726 // Escape sequence was recognized. Insert the char 1727 // into the test data. 1728 tp.dataToBreak.append(cp); 1729 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1730 tp.srcLine->addElement(lineNum, status); 1731 tp.srcCol ->addElement(column, status); 1732 } 1733 break; 1734 } 1735 1736 1737 // Not a recognized backslash escape sequence. 1738 // Take the next char as a literal. 1739 // TODO: Should this be an error? 1740 c = testString.charAt(charIdx); 1741 charIdx = testString.moveIndex32(charIdx, 1); 1742 } 1743 1744 // Normal, non-escaped data char. 1745 tp.dataToBreak.append(c); 1746 1747 // Save the mapping from offset in the data to line/column numbers in 1748 // the original input file. Will be used for better error messages only. 1749 // If there's an expected break before this char, the slot in the mapping 1750 // vector will already be set for this char; don't overwrite it. 1751 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1752 tp.srcLine->addElement(lineNum, status); 1753 tp.srcCol ->addElement(column, status); 1754 } 1755 break; 1756 1757 1758 case PARSE_NUM: 1759 // We are parsing an expected numeric tag value, like <1234>, 1760 // within a chunk of data. 1761 if (u_isUWhiteSpace(c)) { 1762 break; 1763 } 1764 1765 if (c == CH_GT) { 1766 // Finished the number. Add the info to the expected break data, 1767 // and switch parse state back to doing plain data. 
1768 parseState = PARSE_DATA; 1769 if (tagValue == 0) { 1770 tagValue = -1; 1771 } 1772 int32_t breakIdx = tp.dataToBreak.length(); 1773 tp.expectedBreaks->setSize(breakIdx+1); 1774 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1775 tp.srcLine->setSize(breakIdx+1); 1776 tp.srcLine->setElementAt(lineNum, breakIdx); 1777 tp.srcCol ->setSize(breakIdx+1); 1778 tp.srcCol ->setElementAt(column, breakIdx); 1779 break; 1780 } 1781 1782 if (u_isdigit(c)) { 1783 tagValue = tagValue*10 + u_charDigitValue(c); 1784 break; 1785 } 1786 1787 errln("Syntax Error in test file at line %d, col %d", 1788 lineNum, column); 1789 parseState = PARSE_COMMENT; 1790 goto end_test; // Stop the test 1791 break; 1792 } 1793 1794 1795 if (U_FAILURE(status)) { 1796 errln("ICU Error %s while parsing test file at line %d.", 1797 u_errorName(status), lineNum); 1798 status = U_ZERO_ERROR; 1799 goto end_test; // Stop the test 1800 } 1801 1802 } 1803 1804 end_test: 1805 delete tp.bi; 1806 delete tp.expectedBreaks; 1807 delete tp.srcLine; 1808 delete tp.srcCol; 1809 delete [] testFile; 1810 #endif 1811 } 1812 1813 void RBBITest::TestThaiBreaks() { 1814 UErrorCode status=U_ZERO_ERROR; 1815 BreakIterator* b; 1816 Locale locale = Locale("th"); 1817 int32_t p, index; 1818 UChar c[]= { 1819 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 1820 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 1821 0x0E16, 0x0E49, 0x0E33, 0x0000 1822 }; 1823 int32_t expectedWordResult[] = { 1824 2, 3, 6, 10, 11, 15, 17, 20, 22 1825 }; 1826 int32_t expectedLineResult[] = { 1827 3, 6, 11, 15, 17, 20, 22 1828 }; 1829 1830 int32_t size = u_strlen(c); 1831 UnicodeString text=UnicodeString(c); 1832 1833 b = BreakIterator::createWordInstance(locale, status); 1834 if (U_FAILURE(status)) { 1835 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 1836 return; 1837 } 1838 b->setText(text); 1839 p = index = 0; 1840 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1841 if (p != expectedWordResult[index++]) { 1842 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 1843 } 1844 } 1845 delete b; 1846 1847 b = BreakIterator::createLineInstance(locale, status); 1848 if (U_FAILURE(status)) { 1849 printf("Unable to create thai line break iterator.\n"); 1850 return; 1851 } 1852 b->setText(text); 1853 p = index = 0; 1854 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1855 if (p != expectedLineResult[index++]) { 1856 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 1857 } 1858 } 1859 1860 delete b; 1861 } 1862 1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 1864 // Words don't include colon or period (cldrbug #1969). 1865 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 1866 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 1867 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 1868 1869 // UBreakIteratorType UBRK_WORD, Locale "ja" 1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
1871 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" 1872 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; 1873 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; 1874 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; 1875 1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el" 1877 // Add break after Greek question mark (cldrbug #2069). 1878 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. " 1879 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3"; 1880 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 }; 1881 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 }; 1882 1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th" 1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161). 
1885 static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 1886 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 1887 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 1888 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 1889 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 1890 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 1891 static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 1892 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 1893 29, 32, 33, 35, 37, 38, 40, 41 }; 1894 1895 typedef struct { 1896 UBreakIteratorType type; 1897 const char * locale; 1898 const char * escapedText; 1899 const int32_t * tailoredOffsets; 1900 int32_t tailoredOffsetsCount; 1901 const int32_t * rootOffsets; 1902 int32_t rootOffsetsCount; 1903 } TailoredBreakItem; 1904 1905 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 1906 1907 static const TailoredBreakItem tbItems[] = { 1908 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 1909 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 1910 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 1911 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 1912 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 1913 }; 1914 1915 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 1916 while (count-- > 0) { 1917 int writeCount; 1918 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 1919 buffer += writeCount; 1920 buflen -= writeCount; 1921 } 1922 } 1923 1924 enum { kMaxOffsetCount = 128 }; 1925 1926 void RBBITest::TBTest(BreakIterator* brkitr, int type, const 
char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 1927 brkitr->setText( CharsToUnicodeString(escapedText) ); 1928 int32_t foundOffsets[kMaxOffsetCount]; 1929 int32_t offset, foundOffsetsCount = 0; 1930 // do forwards iteration test 1931 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 1932 foundOffsets[foundOffsetsCount++] = offset; 1933 } 1934 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 1935 // log error for forwards test 1936 char formatExpect[512], formatFound[512]; 1937 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1938 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 1939 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 1940 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 1941 } else { 1942 // do backwards iteration test 1943 --foundOffsetsCount; // back off one from the end offset 1944 while ( foundOffsetsCount > 0 ) { 1945 offset = brkitr->previous(); 1946 if ( offset != foundOffsets[--foundOffsetsCount] ) { 1947 // log error for backwards test 1948 char formatExpect[512]; 1949 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1950 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 1951 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 1952 break; 1953 } 1954 } 1955 } 1956 } 1957 1958 void RBBITest::TestTailoredBreaks() { 1959 const TailoredBreakItem * tbItemPtr; 1960 Locale rootLocale = Locale("root"); 1961 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 1962 Locale testLocale = Locale(tbItemPtr->locale); 1963 BreakIterator * tailoredBrkiter 
= NULL; 1964 BreakIterator * rootBrkiter = NULL; 1965 UErrorCode status = U_ZERO_ERROR; 1966 switch (tbItemPtr->type) { 1967 case UBRK_CHARACTER: 1968 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 1969 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 1970 break; 1971 case UBRK_WORD: 1972 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 1973 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 1974 break; 1975 case UBRK_LINE: 1976 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 1977 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 1978 break; 1979 case UBRK_SENTENCE: 1980 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 1981 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 1982 break; 1983 default: 1984 status = U_UNSUPPORTED_ERROR; 1985 break; 1986 } 1987 if (U_FAILURE(status)) { 1988 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 1989 continue; 1990 } 1991 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 1992 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 1993 1994 delete rootBrkiter; 1995 delete tailoredBrkiter; 1996 } 1997 } 1998 1999 2000 //------------------------------------------------------------------------------- 2001 // 2002 // TestDictRules create a break iterator from source rules that includes a 2003 // dictionary range. Regression for bug #7130. Source rules 2004 // do not declare a break iterator type (word, line, sentence, etc. 2005 // but the dictionary code, without a type, would loop. 
2006 // 2007 //------------------------------------------------------------------------------- 2008 void RBBITest::TestDictRules() { 2009 const char *rules = "$dictionary = [a-z]; \n" 2010 "!!forward; \n" 2011 "$dictionary $dictionary; \n" 2012 "!!reverse; \n" 2013 "$dictionary $dictionary; \n"; 2014 const char *text = "aa"; 2015 UErrorCode status = U_ZERO_ERROR; 2016 UParseError parseError; 2017 2018 RuleBasedBreakIterator bi(rules, parseError, status); 2019 if (U_SUCCESS(status)) { 2020 UnicodeString utext = text; 2021 bi.setText(utext); 2022 int32_t position; 2023 int32_t loops; 2024 for (loops = 0; loops<10; loops++) { 2025 position = bi.next(); 2026 if (position == RuleBasedBreakIterator::DONE) { 2027 break; 2028 } 2029 } 2030 TEST_ASSERT(loops == 1); 2031 } else { 2032 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 2033 } 2034 } 2035 2036 2037 2038 //------------------------------------------------------------------------------- 2039 // 2040 // ReadAndConvertFile Read a text data file, convert it to UChars, and 2041 // return the datain one big UChar * buffer, which the caller must delete. 2042 // 2043 // parameters: 2044 // fileName: the name of the file, with no directory part. The test data directory 2045 // is assumed. 2046 // ulen an out parameter, receives the actual length (in UChars) of the file data. 2047 // encoding The file encoding. If the file contains a BOM, that will override the encoding 2048 // specified here. The BOM, if it exists, will be stripped from the returned data. 2049 // Pass NULL for the system default encoding. 2050 // status 2051 // returns: 2052 // The file data, converted to UChar. 2053 // The caller must delete this when done with 2054 // delete [] theBuffer; 2055 // 2056 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 2057 // Move this function to some common place. 
2058 // 2059 //-------------------------------------------------------------------------------- 2060 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 2061 UChar *retPtr = NULL; 2062 char *fileBuf = NULL; 2063 UConverter* conv = NULL; 2064 FILE *f = NULL; 2065 2066 ulen = 0; 2067 if (U_FAILURE(status)) { 2068 return retPtr; 2069 } 2070 2071 // 2072 // Open the file. 2073 // 2074 f = fopen(fileName, "rb"); 2075 if (f == 0) { 2076 dataerrln("Error opening test data file %s\n", fileName); 2077 status = U_FILE_ACCESS_ERROR; 2078 return NULL; 2079 } 2080 // 2081 // Read it in 2082 // 2083 int fileSize; 2084 int amt_read; 2085 2086 fseek( f, 0, SEEK_END); 2087 fileSize = ftell(f); 2088 fileBuf = new char[fileSize]; 2089 fseek(f, 0, SEEK_SET); 2090 amt_read = fread(fileBuf, 1, fileSize, f); 2091 if (amt_read != fileSize || fileSize <= 0) { 2092 errln("Error reading test data file."); 2093 goto cleanUpAndReturn; 2094 } 2095 2096 // 2097 // Look for a Unicode Signature (BOM) on the data just read 2098 // 2099 int32_t signatureLength; 2100 const char * fileBufC; 2101 const char* bomEncoding; 2102 2103 fileBufC = fileBuf; 2104 bomEncoding = ucnv_detectUnicodeSignature( 2105 fileBuf, fileSize, &signatureLength, &status); 2106 if(bomEncoding!=NULL ){ 2107 fileBufC += signatureLength; 2108 fileSize -= signatureLength; 2109 encoding = bomEncoding; 2110 } 2111 2112 // 2113 // Open a converter to take the rule file to UTF-16 2114 // 2115 conv = ucnv_open(encoding, &status); 2116 if (U_FAILURE(status)) { 2117 goto cleanUpAndReturn; 2118 } 2119 2120 // 2121 // Convert the rules to UChar. 2122 // Preflight first to determine required buffer size. 2123 // 2124 ulen = ucnv_toUChars(conv, 2125 NULL, // dest, 2126 0, // destCapacity, 2127 fileBufC, 2128 fileSize, 2129 &status); 2130 if (status == U_BUFFER_OVERFLOW_ERROR) { 2131 // Buffer Overflow is expected from the preflight operation. 
2132 status = U_ZERO_ERROR; 2133 2134 retPtr = new UChar[ulen+1]; 2135 ucnv_toUChars(conv, 2136 retPtr, // dest, 2137 ulen+1, 2138 fileBufC, 2139 fileSize, 2140 &status); 2141 } 2142 2143 cleanUpAndReturn: 2144 fclose(f); 2145 delete []fileBuf; 2146 ucnv_close(conv); 2147 if (U_FAILURE(status)) { 2148 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2149 delete retPtr; 2150 retPtr = 0; 2151 ulen = 0; 2152 }; 2153 return retPtr; 2154 } 2155 2156 2157 2158 //-------------------------------------------------------------------------------------------- 2159 // 2160 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 2161 // 2162 //------------------------------------------------------------------------------------------- 2163 void RBBITest::TestUnicodeFiles() { 2164 RuleBasedBreakIterator *bi; 2165 UErrorCode status = U_ZERO_ERROR; 2166 2167 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2168 TEST_ASSERT_SUCCESS(status); 2169 if (U_SUCCESS(status)) { 2170 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2171 } 2172 delete bi; 2173 2174 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 2175 TEST_ASSERT_SUCCESS(status); 2176 if (U_SUCCESS(status)) { 2177 runUnicodeTestData("WordBreakTest.txt", bi); 2178 } 2179 delete bi; 2180 2181 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 2182 TEST_ASSERT_SUCCESS(status); 2183 if (U_SUCCESS(status)) { 2184 runUnicodeTestData("SentenceBreakTest.txt", bi); 2185 } 2186 delete bi; 2187 2188 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 2189 TEST_ASSERT_SUCCESS(status); 2190 if (U_SUCCESS(status)) { 2191 runUnicodeTestData("LineBreakTest.txt", bi); 2192 } 2193 delete bi; 2194 } 2195 2196 2197 
//-------------------------------------------------------------------------------------------- 2198 // 2199 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 2200 // 2201 //------------------------------------------------------------------------------------------- 2202 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2203 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2204 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. 2205 UVersionInfo icu4601 = { 4, 6, 0, 1 }; 2206 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601); 2207 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 2208 UErrorCode status = U_ZERO_ERROR; 2209 2210 // 2211 // Open and read the test data file, put it into a UnicodeString. 2212 // 2213 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2214 char testFileName[1000]; 2215 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2216 dataerrln("Can't open test data. Path too long."); 2217 return; 2218 } 2219 strcpy(testFileName, testDataDirectory); 2220 strcat(testFileName, fileName); 2221 2222 logln("Opening data file %s\n", fileName); 2223 2224 int len; 2225 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2226 if (status != U_FILE_ACCESS_ERROR) { 2227 TEST_ASSERT_SUCCESS(status); 2228 TEST_ASSERT(testFile != NULL); 2229 } 2230 if (U_FAILURE(status) || testFile == NULL) { 2231 return; /* something went wrong, error already output */ 2232 } 2233 UnicodeString testFileAsString(TRUE, testFile, len); 2234 2235 // 2236 // Parse the test data file using a regular expression. 2237 // Each kind of token is recognized in its own capture group; what type of item was scanned 2238 // is identified by which group had a match. 
2239 // 2240 // Caputure Group # 1 2 3 4 5 2241 // Parses this item: divide x hex digits comment \n unrecognized \n 2242 // 2243 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2244 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2245 UnicodeString testString; 2246 UVector32 breakPositions(status); 2247 int lineNumber = 1; 2248 TEST_ASSERT_SUCCESS(status); 2249 if (U_FAILURE(status)) { 2250 return; 2251 } 2252 2253 // 2254 // Scan through each test case, building up the string to be broken in testString, 2255 // and the positions that should be boundaries in the breakPositions vector. 2256 // 2257 int spin = 0; 2258 while (tokenMatcher.find()) { 2259 if(tokenMatcher.hitEnd()) { 2260 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 2261 This occurred when the text file was corrupt (wasn't marked as UTF-8) 2262 and caused an infinite loop here on EBCDIC systems! 2263 */ 2264 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 2265 // return; 2266 } 2267 if (tokenMatcher.start(1, status) >= 0) { 2268 // Scanned a divide sign, indicating a break position in the test data. 2269 if (testString.length()>0) { 2270 breakPositions.addElement(testString.length(), status); 2271 } 2272 } 2273 else if (tokenMatcher.start(2, status) >= 0) { 2274 // Scanned an 'x', meaning no break at this position in the test data 2275 // Nothing to be done here. 2276 } 2277 else if (tokenMatcher.start(3, status) >= 0) { 2278 // Scanned Hex digits. Convert them to binary, append to the character data string. 
2279 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2280 int length = hexNumber.length(); 2281 if (length<=8) { 2282 char buf[10]; 2283 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2284 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2285 if (c<=0x10ffff) { 2286 testString.append(c); 2287 } else { 2288 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 2289 fileName, lineNumber); 2290 } 2291 } else { 2292 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2293 fileName, lineNumber); 2294 } 2295 } 2296 else if (tokenMatcher.start(4, status) >= 0) { 2297 // Scanned to end of a line, possibly skipping over a comment in the process. 2298 // If the line from the file contained test data, run the test now. 2299 // 2300 if (testString.length() > 0) { 2301 // TODO(andy): Remove this time bomb code. 2302 if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) { 2303 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2304 } 2305 } 2306 2307 // Clear out this test case. 2308 // The string and breakPositions vector will be refilled as the next 2309 // test case is parsed. 2310 testString.remove(); 2311 breakPositions.removeAllElements(); 2312 lineNumber++; 2313 } else { 2314 // Scanner catchall. Something unrecognized appeared on the line. 2315 char token[16]; 2316 UnicodeString uToken = tokenMatcher.group(0, status); 2317 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2318 token[sizeof(token)-1] = 0; 2319 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2320 2321 // Clean up, in preparation for continuing with the next line. 
2322 testString.remove(); 2323 breakPositions.removeAllElements(); 2324 lineNumber++; 2325 } 2326 TEST_ASSERT_SUCCESS(status); 2327 if (U_FAILURE(status)) { 2328 break; 2329 } 2330 } 2331 2332 delete [] testFile; 2333 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2334 } 2335 2336 //-------------------------------------------------------------------------------------------- 2337 // 2338 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2339 // test data files. Do only a simple, forward-only check - 2340 // this test is mostly to check that ICU and the Unicode 2341 // data agree with each other. 2342 // 2343 //-------------------------------------------------------------------------------------------- 2344 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2345 const UnicodeString &testString, // Text data to be broken 2346 UVector32 *breakPositions, // Positions where breaks should be found. 2347 RuleBasedBreakIterator *bi) { 2348 int32_t pos; // Break Position in the test string 2349 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
2350 int32_t expectedPos; // Expected break position (index into test string) 2351 2352 bi->setText(testString); 2353 pos = bi->first(); 2354 pos = bi->next(); 2355 2356 while (pos != BreakIterator::DONE) { 2357 if (expectedI >= breakPositions->size()) { 2358 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2359 testFileName, lineNumber, pos); 2360 break; 2361 } 2362 expectedPos = breakPositions->elementAti(expectedI); 2363 if (pos < expectedPos) { 2364 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2365 testFileName, lineNumber, pos); 2366 break; 2367 } 2368 if (pos > expectedPos) { 2369 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2370 testFileName, lineNumber, expectedPos); 2371 break; 2372 } 2373 pos = bi->next(); 2374 expectedI++; 2375 } 2376 2377 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2378 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2379 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2380 } 2381 } 2382 2383 2384 2385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2386 //--------------------------------------------------------------------------------------- 2387 // 2388 // classs RBBIMonkeyKind 2389 // 2390 // Monkey Test for Break Iteration 2391 // Abstract interface class. Concrete derived classes independently 2392 // implement the break rules for different iterator types. 2393 // 2394 // The Monkey Test itself uses doesn't know which type of break iterator it is 2395 // testing, but works purely in terms of the interface defined here. 2396 // 2397 //--------------------------------------------------------------------------------------- 2398 class RBBIMonkeyKind { 2399 public: 2400 // Return a UVector of UnicodeSets, representing the character classes used 2401 // for this type of iterator. 
2402 virtual UVector *charClasses() = 0; 2403 2404 // Set the test text on which subsequent calls to next() will operate 2405 virtual void setText(const UnicodeString &s) = 0; 2406 2407 // Find the next break postion, starting from the prev break position, or from zero. 2408 // Return -1 after reaching end of string. 2409 virtual int32_t next(int32_t i) = 0; 2410 2411 virtual ~RBBIMonkeyKind(); 2412 UErrorCode deferredStatus; 2413 2414 2415 protected: 2416 RBBIMonkeyKind(); 2417 2418 private: 2419 }; 2420 2421 RBBIMonkeyKind::RBBIMonkeyKind() { 2422 deferredStatus = U_ZERO_ERROR; 2423 } 2424 2425 RBBIMonkeyKind::~RBBIMonkeyKind() { 2426 } 2427 2428 2429 //---------------------------------------------------------------------------------------- 2430 // 2431 // Random Numbers. Similar to standard lib rand() and srand() 2432 // Not using library to 2433 // 1. Get same results on all platforms. 2434 // 2. Get access to current seed, to more easily reproduce failures. 2435 // 2436 //--------------------------------------------------------------------------------------- 2437 static uint32_t m_seed = 1; 2438 2439 static uint32_t m_rand() 2440 { 2441 m_seed = m_seed * 1103515245 + 12345; 2442 return (uint32_t)(m_seed/65536) % 32768; 2443 } 2444 2445 2446 //------------------------------------------------------------------------------------------ 2447 // 2448 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2449 // of RBBIMonkeyKind. 
2450 // 2451 //------------------------------------------------------------------------------------------ 2452 class RBBICharMonkey: public RBBIMonkeyKind { 2453 public: 2454 RBBICharMonkey(); 2455 virtual ~RBBICharMonkey(); 2456 virtual UVector *charClasses(); 2457 virtual void setText(const UnicodeString &s); 2458 virtual int32_t next(int32_t i); 2459 private: 2460 UVector *fSets; 2461 2462 UnicodeSet *fCRLFSet; 2463 UnicodeSet *fControlSet; 2464 UnicodeSet *fExtendSet; 2465 UnicodeSet *fPrependSet; 2466 UnicodeSet *fSpacingSet; 2467 UnicodeSet *fLSet; 2468 UnicodeSet *fVSet; 2469 UnicodeSet *fTSet; 2470 UnicodeSet *fLVSet; 2471 UnicodeSet *fLVTSet; 2472 UnicodeSet *fHangulSet; 2473 UnicodeSet *fAnySet; 2474 2475 const UnicodeString *fText; 2476 }; 2477 2478 2479 RBBICharMonkey::RBBICharMonkey() { 2480 UErrorCode status = U_ZERO_ERROR; 2481 2482 fText = NULL; 2483 2484 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2485 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2486 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2487 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2488 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2489 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2490 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2491 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2492 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2493 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2494 fHangulSet = new UnicodeSet(); 2495 fHangulSet->addAll(*fLSet); 2496 fHangulSet->addAll(*fVSet); 2497 
fHangulSet->addAll(*fTSet); 2498 fHangulSet->addAll(*fLVSet); 2499 fHangulSet->addAll(*fLVTSet); 2500 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); 2501 2502 fSets = new UVector(status); 2503 fSets->addElement(fCRLFSet, status); 2504 fSets->addElement(fControlSet, status); 2505 fSets->addElement(fExtendSet, status); 2506 fSets->addElement(fPrependSet, status); 2507 fSets->addElement(fSpacingSet, status); 2508 fSets->addElement(fHangulSet, status); 2509 fSets->addElement(fAnySet, status); 2510 if (U_FAILURE(status)) { 2511 deferredStatus = status; 2512 } 2513 } 2514 2515 2516 void RBBICharMonkey::setText(const UnicodeString &s) { 2517 fText = &s; 2518 } 2519 2520 2521 2522 int32_t RBBICharMonkey::next(int32_t prevPos) { 2523 int p0, p1, p2, p3; // Indices of the significant code points around the 2524 // break position being tested. The candidate break 2525 // location is before p2. 2526 2527 int breakPos = -1; 2528 2529 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2530 2531 if (U_FAILURE(deferredStatus)) { 2532 return -1; 2533 } 2534 2535 // Previous break at end of string. return DONE. 2536 if (prevPos >= fText->length()) { 2537 return -1; 2538 } 2539 p0 = p1 = p2 = p3 = prevPos; 2540 c3 = fText->char32At(prevPos); 2541 c0 = c1 = c2 = 0; 2542 2543 // Loop runs once per "significant" character position in the input text. 2544 for (;;) { 2545 // Move all of the positions forward in the input string. 2546 p0 = p1; c0 = c1; 2547 p1 = p2; c1 = c2; 2548 p2 = p3; c2 = c3; 2549 2550 // Advancd p3 by one codepoint 2551 p3 = fText->moveIndex32(p3, 1); 2552 c3 = fText->char32At(p3); 2553 2554 if (p1 == p2) { 2555 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2556 continue; 2557 } 2558 if (p2 == fText->length()) { 2559 // Reached end of string. Always a break position. 
2560 break; 2561 } 2562 2563 // Rule GB3 CR x LF 2564 // No Extend or Format characters may appear between the CR and LF, 2565 // which requires the additional check for p2 immediately following p1. 2566 // 2567 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2568 continue; 2569 } 2570 2571 // Rule (GB4). ( Control | CR | LF ) <break> 2572 if (fControlSet->contains(c1) || 2573 c1 == 0x0D || 2574 c1 == 0x0A) { 2575 break; 2576 } 2577 2578 // Rule (GB5) <break> ( Control | CR | LF ) 2579 // 2580 if (fControlSet->contains(c2) || 2581 c2 == 0x0D || 2582 c2 == 0x0A) { 2583 break; 2584 } 2585 2586 2587 // Rule (GB6) L x ( L | V | LV | LVT ) 2588 if (fLSet->contains(c1) && 2589 (fLSet->contains(c2) || 2590 fVSet->contains(c2) || 2591 fLVSet->contains(c2) || 2592 fLVTSet->contains(c2))) { 2593 continue; 2594 } 2595 2596 // Rule (GB7) ( LV | V ) x ( V | T ) 2597 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2598 (fVSet->contains(c2) || fTSet->contains(c2))) { 2599 continue; 2600 } 2601 2602 // Rule (GB8) ( LVT | T) x T 2603 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2604 fTSet->contains(c2)) { 2605 continue; 2606 } 2607 2608 // Rule (GB9) Numeric x ALetter 2609 if (fExtendSet->contains(c2)) { 2610 continue; 2611 } 2612 2613 // Rule (GB9a) x SpacingMark 2614 if (fSpacingSet->contains(c2)) { 2615 continue; 2616 } 2617 2618 // Rule (GB9b) Prepend x 2619 if (fPrependSet->contains(c1)) { 2620 continue; 2621 } 2622 2623 // Rule (GB10) Any <break> Any 2624 break; 2625 } 2626 2627 breakPos = p2; 2628 return breakPos; 2629 } 2630 2631 2632 2633 UVector *RBBICharMonkey::charClasses() { 2634 return fSets; 2635 } 2636 2637 2638 RBBICharMonkey::~RBBICharMonkey() { 2639 delete fSets; 2640 delete fCRLFSet; 2641 delete fControlSet; 2642 delete fExtendSet; 2643 delete fPrependSet; 2644 delete fSpacingSet; 2645 delete fLSet; 2646 delete fVSet; 2647 delete fTSet; 2648 delete fLVSet; 2649 delete fLVTSet; 2650 delete fHangulSet; 2651 delete fAnySet; 2652 } 2653 2654 
//------------------------------------------------------------------------------------------ 2655 // 2656 // class RBBIWordMonkey Word Break specific implementation 2657 // of RBBIMonkeyKind. 2658 // 2659 //------------------------------------------------------------------------------------------ 2660 class RBBIWordMonkey: public RBBIMonkeyKind { 2661 public: 2662 RBBIWordMonkey(); 2663 virtual ~RBBIWordMonkey(); 2664 virtual UVector *charClasses(); 2665 virtual void setText(const UnicodeString &s); 2666 virtual int32_t next(int32_t i); 2667 private: 2668 UVector *fSets; 2669 2670 UnicodeSet *fCRSet; 2671 UnicodeSet *fLFSet; 2672 UnicodeSet *fNewlineSet; 2673 UnicodeSet *fKatakanaSet; 2674 UnicodeSet *fALetterSet; 2675 UnicodeSet *fMidNumLetSet; 2676 UnicodeSet *fMidLetterSet; 2677 UnicodeSet *fMidNumSet; 2678 UnicodeSet *fNumericSet; 2679 UnicodeSet *fFormatSet; 2680 UnicodeSet *fOtherSet; 2681 UnicodeSet *fExtendSet; 2682 UnicodeSet *fExtendNumLetSet; 2683 2684 RegexMatcher *fMatcher; 2685 2686 const UnicodeString *fText; 2687 }; 2688 2689 2690 RBBIWordMonkey::RBBIWordMonkey() 2691 { 2692 UErrorCode status = U_ZERO_ERROR; 2693 2694 fSets = new UVector(status); 2695 2696 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2697 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2698 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2699 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2700 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2701 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2702 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2703 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2704 fNumericSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2705 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2706 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2707 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2708 2709 fOtherSet = new UnicodeSet(); 2710 if(U_FAILURE(status)) { 2711 deferredStatus = status; 2712 return; 2713 } 2714 2715 fOtherSet->complement(); 2716 fOtherSet->removeAll(*fCRSet); 2717 fOtherSet->removeAll(*fLFSet); 2718 fOtherSet->removeAll(*fNewlineSet); 2719 fOtherSet->removeAll(*fKatakanaSet); 2720 fOtherSet->removeAll(*fALetterSet); 2721 fOtherSet->removeAll(*fMidLetterSet); 2722 fOtherSet->removeAll(*fMidNumSet); 2723 fOtherSet->removeAll(*fNumericSet); 2724 fOtherSet->removeAll(*fExtendNumLetSet); 2725 fOtherSet->removeAll(*fFormatSet); 2726 fOtherSet->removeAll(*fExtendSet); 2727 // Inhibit dictionary characters from being tested at all. 
2728 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2729 2730 fSets->addElement(fCRSet, status); 2731 fSets->addElement(fLFSet, status); 2732 fSets->addElement(fNewlineSet, status); 2733 fSets->addElement(fALetterSet, status); 2734 fSets->addElement(fKatakanaSet, status); 2735 fSets->addElement(fMidLetterSet, status); 2736 fSets->addElement(fMidNumLetSet, status); 2737 fSets->addElement(fMidNumSet, status); 2738 fSets->addElement(fNumericSet, status); 2739 fSets->addElement(fFormatSet, status); 2740 fSets->addElement(fExtendSet, status); 2741 fSets->addElement(fOtherSet, status); 2742 fSets->addElement(fExtendNumLetSet, status); 2743 2744 if (U_FAILURE(status)) { 2745 deferredStatus = status; 2746 } 2747 } 2748 2749 void RBBIWordMonkey::setText(const UnicodeString &s) { 2750 fText = &s; 2751 } 2752 2753 2754 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2755 int p0, p1, p2, p3; // Indices of the significant code points around the 2756 // break position being tested. The candidate break 2757 // location is before p2. 2758 2759 int breakPos = -1; 2760 2761 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2762 2763 if (U_FAILURE(deferredStatus)) { 2764 return -1; 2765 } 2766 2767 // Prev break at end of string. return DONE. 2768 if (prevPos >= fText->length()) { 2769 return -1; 2770 } 2771 p0 = p1 = p2 = p3 = prevPos; 2772 c3 = fText->char32At(prevPos); 2773 c0 = c1 = c2 = 0; 2774 2775 // Loop runs once per "significant" character position in the input text. 2776 for (;;) { 2777 // Move all of the positions forward in the input string. 2778 p0 = p1; c0 = c1; 2779 p1 = p2; c1 = c2; 2780 p2 = p3; c2 = c3; 2781 2782 // Advancd p3 by X(Extend | Format)* Rule 4 2783 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 2784 do { 2785 p3 = fText->moveIndex32(p3, 1); 2786 c3 = fText->char32At(p3); 2787 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2788 break; 2789 }; 2790 } 2791 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2792 2793 2794 if (p1 == p2) { 2795 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2796 continue; 2797 } 2798 if (p2 == fText->length()) { 2799 // Reached end of string. Always a break position. 2800 break; 2801 } 2802 2803 // Rule (3) CR x LF 2804 // No Extend or Format characters may appear between the CR and LF, 2805 // which requires the additional check for p2 immediately following p1. 2806 // 2807 if (c1==0x0D && c2==0x0A) { 2808 continue; 2809 } 2810 2811 // Rule (3a) Break before and after newlines (including CR and LF) 2812 // 2813 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2814 break; 2815 }; 2816 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2817 break; 2818 }; 2819 2820 // Rule (5). 
ALetter x ALetter 2821 if (fALetterSet->contains(c1) && 2822 fALetterSet->contains(c2)) { 2823 continue; 2824 } 2825 2826 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 2827 // 2828 if ( fALetterSet->contains(c1) && 2829 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2830 fALetterSet->contains(c3)) { 2831 continue; 2832 } 2833 2834 2835 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 2836 if (fALetterSet->contains(c0) && 2837 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2838 fALetterSet->contains(c2)) { 2839 continue; 2840 } 2841 2842 // Rule (8) Numeric x Numeric 2843 if (fNumericSet->contains(c1) && 2844 fNumericSet->contains(c2)) { 2845 continue; 2846 } 2847 2848 // Rule (9) ALetter x Numeric 2849 if (fALetterSet->contains(c1) && 2850 fNumericSet->contains(c2)) { 2851 continue; 2852 } 2853 2854 // Rule (10) Numeric x ALetter 2855 if (fNumericSet->contains(c1) && 2856 fALetterSet->contains(c2)) { 2857 continue; 2858 } 2859 2860 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 2861 if (fNumericSet->contains(c0) && 2862 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2863 fNumericSet->contains(c2)) { 2864 continue; 2865 } 2866 2867 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 2868 if (fNumericSet->contains(c1) && 2869 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2870 fNumericSet->contains(c3)) { 2871 continue; 2872 } 2873 2874 // Rule (13) Katakana x Katakana 2875 if (fKatakanaSet->contains(c1) && 2876 fKatakanaSet->contains(c2)) { 2877 continue; 2878 } 2879 2880 // Rule 13a 2881 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 2882 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2883 fExtendNumLetSet->contains(c2)) { 2884 continue; 2885 } 2886 2887 // Rule 13b 2888 if (fExtendNumLetSet->contains(c1) && 2889 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 2890 fKatakanaSet->contains(c2))) { 2891 continue; 2892 } 2893 2894 // Rule 14. 
Break found here. 2895 break; 2896 } 2897 2898 breakPos = p2; 2899 return breakPos; 2900 } 2901 2902 2903 UVector *RBBIWordMonkey::charClasses() { 2904 return fSets; 2905 } 2906 2907 2908 RBBIWordMonkey::~RBBIWordMonkey() { 2909 delete fSets; 2910 delete fCRSet; 2911 delete fLFSet; 2912 delete fNewlineSet; 2913 delete fKatakanaSet; 2914 delete fALetterSet; 2915 delete fMidNumLetSet; 2916 delete fMidLetterSet; 2917 delete fMidNumSet; 2918 delete fNumericSet; 2919 delete fFormatSet; 2920 delete fExtendSet; 2921 delete fExtendNumLetSet; 2922 delete fOtherSet; 2923 } 2924 2925 2926 2927 2928 //------------------------------------------------------------------------------------------ 2929 // 2930 // class RBBISentMonkey Sentence Break specific implementation 2931 // of RBBIMonkeyKind. 2932 // 2933 //------------------------------------------------------------------------------------------ 2934 class RBBISentMonkey: public RBBIMonkeyKind { 2935 public: 2936 RBBISentMonkey(); 2937 virtual ~RBBISentMonkey(); 2938 virtual UVector *charClasses(); 2939 virtual void setText(const UnicodeString &s); 2940 virtual int32_t next(int32_t i); 2941 private: 2942 int moveBack(int posFrom); 2943 int moveForward(int posFrom); 2944 UChar32 cAt(int pos); 2945 2946 UVector *fSets; 2947 2948 UnicodeSet *fSepSet; 2949 UnicodeSet *fFormatSet; 2950 UnicodeSet *fSpSet; 2951 UnicodeSet *fLowerSet; 2952 UnicodeSet *fUpperSet; 2953 UnicodeSet *fOLetterSet; 2954 UnicodeSet *fNumericSet; 2955 UnicodeSet *fATermSet; 2956 UnicodeSet *fSContinueSet; 2957 UnicodeSet *fSTermSet; 2958 UnicodeSet *fCloseSet; 2959 UnicodeSet *fOtherSet; 2960 UnicodeSet *fExtendSet; 2961 2962 const UnicodeString *fText; 2963 2964 }; 2965 2966 RBBISentMonkey::RBBISentMonkey() 2967 { 2968 UErrorCode status = U_ZERO_ERROR; 2969 2970 fSets = new UVector(status); 2971 2972 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2973 // set and made into character classes of their own. 
For the monkey impl, 2974 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2975 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2976 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2977 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2978 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2979 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2980 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2981 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2982 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2983 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2984 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2985 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2986 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2987 fOtherSet = new UnicodeSet(); 2988 2989 if(U_FAILURE(status)) { 2990 deferredStatus = status; 2991 return; 2992 } 2993 2994 fOtherSet->complement(); 2995 fOtherSet->removeAll(*fSepSet); 2996 fOtherSet->removeAll(*fFormatSet); 2997 fOtherSet->removeAll(*fSpSet); 2998 fOtherSet->removeAll(*fLowerSet); 2999 fOtherSet->removeAll(*fUpperSet); 3000 fOtherSet->removeAll(*fOLetterSet); 3001 fOtherSet->removeAll(*fNumericSet); 3002 fOtherSet->removeAll(*fATermSet); 3003 fOtherSet->removeAll(*fSContinueSet); 3004 fOtherSet->removeAll(*fSTermSet); 3005 fOtherSet->removeAll(*fCloseSet); 3006 fOtherSet->removeAll(*fExtendSet); 3007 3008 fSets->addElement(fSepSet, status); 3009 
fSets->addElement(fFormatSet, status); 3010 fSets->addElement(fSpSet, status); 3011 fSets->addElement(fLowerSet, status); 3012 fSets->addElement(fUpperSet, status); 3013 fSets->addElement(fOLetterSet, status); 3014 fSets->addElement(fNumericSet, status); 3015 fSets->addElement(fATermSet, status); 3016 fSets->addElement(fSContinueSet, status); 3017 fSets->addElement(fSTermSet, status); 3018 fSets->addElement(fCloseSet, status); 3019 fSets->addElement(fOtherSet, status); 3020 fSets->addElement(fExtendSet, status); 3021 3022 if (U_FAILURE(status)) { 3023 deferredStatus = status; 3024 } 3025 } 3026 3027 3028 3029 void RBBISentMonkey::setText(const UnicodeString &s) { 3030 fText = &s; 3031 } 3032 3033 UVector *RBBISentMonkey::charClasses() { 3034 return fSets; 3035 } 3036 3037 3038 // moveBack() Find the "significant" code point preceding the index i. 3039 // Skips over ($Extend | $Format)* . 3040 // 3041 int RBBISentMonkey::moveBack(int i) { 3042 if (i <= 0) { 3043 return -1; 3044 } 3045 UChar32 c; 3046 int32_t j = i; 3047 do { 3048 j = fText->moveIndex32(j, -1); 3049 c = fText->char32At(j); 3050 } 3051 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 3052 return j; 3053 3054 } 3055 3056 3057 int RBBISentMonkey::moveForward(int i) { 3058 if (i>=fText->length()) { 3059 return fText->length(); 3060 } 3061 UChar32 c; 3062 int32_t j = i; 3063 do { 3064 j = fText->moveIndex32(j, 1); 3065 c = cAt(j); 3066 } 3067 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 3068 return j; 3069 } 3070 3071 UChar32 RBBISentMonkey::cAt(int pos) { 3072 if (pos<0 || pos>=fText->length()) { 3073 return -1; 3074 } else { 3075 return fText->char32At(pos); 3076 } 3077 } 3078 3079 int32_t RBBISentMonkey::next(int32_t prevPos) { 3080 int p0, p1, p2, p3; // Indices of the significant code points around the 3081 // break position being tested. The candidate break 3082 // location is before p2. 
3083 3084 int breakPos = -1; 3085 3086 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3087 UChar32 c; 3088 3089 if (U_FAILURE(deferredStatus)) { 3090 return -1; 3091 } 3092 3093 // Prev break at end of string. return DONE. 3094 if (prevPos >= fText->length()) { 3095 return -1; 3096 } 3097 p0 = p1 = p2 = p3 = prevPos; 3098 c3 = fText->char32At(prevPos); 3099 c0 = c1 = c2 = 0; 3100 3101 // Loop runs once per "significant" character position in the input text. 3102 for (;;) { 3103 // Move all of the positions forward in the input string. 3104 p0 = p1; c0 = c1; 3105 p1 = p2; c1 = c2; 3106 p2 = p3; c2 = c3; 3107 3108 // Advancd p3 by X(Extend | Format)* Rule 4 3109 p3 = moveForward(p3); 3110 c3 = cAt(p3); 3111 3112 // Rule (3) CR x LF 3113 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 3114 continue; 3115 } 3116 3117 // Rule (4). Sep <break> 3118 if (fSepSet->contains(c1)) { 3119 p2 = p1+1; // Separators don't combine with Extend or Format. 3120 break; 3121 } 3122 3123 if (p2 >= fText->length()) { 3124 // Reached end of string. Always a break position. 3125 break; 3126 } 3127 3128 if (p2 == prevPos) { 3129 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3130 continue; 3131 } 3132 3133 // Rule (6). ATerm x Numeric 3134 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 3135 continue; 3136 } 3137 3138 // Rule (7). Upper ATerm x Uppper 3139 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 3140 continue; 3141 } 3142 3143 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 3144 // Note: STerm | ATerm are added to the negated part of the expression by a 3145 // note to the Unicode 5.0 documents. 
3146 int p8 = p1; 3147 while (fSpSet->contains(cAt(p8))) { 3148 p8 = moveBack(p8); 3149 } 3150 while (fCloseSet->contains(cAt(p8))) { 3151 p8 = moveBack(p8); 3152 } 3153 if (fATermSet->contains(cAt(p8))) { 3154 p8=p2; 3155 for (;;) { 3156 c = cAt(p8); 3157 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 3158 fLowerSet->contains(c) || fSepSet->contains(c) || 3159 fATermSet->contains(c) || fSTermSet->contains(c)) { 3160 break; 3161 } 3162 p8 = moveForward(p8); 3163 } 3164 if (fLowerSet->contains(cAt(p8))) { 3165 continue; 3166 } 3167 } 3168 3169 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 3170 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 3171 p8 = p1; 3172 while (fSpSet->contains(cAt(p8))) { 3173 p8 = moveBack(p8); 3174 } 3175 while (fCloseSet->contains(cAt(p8))) { 3176 p8 = moveBack(p8); 3177 } 3178 c = cAt(p8); 3179 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 3180 continue; 3181 } 3182 } 3183 3184 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 3185 int p9 = p1; 3186 while (fCloseSet->contains(cAt(p9))) { 3187 p9 = moveBack(p9); 3188 } 3189 c = cAt(p9); 3190 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 3191 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 3192 continue; 3193 } 3194 } 3195 3196 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 3197 int p10 = p1; 3198 while (fSpSet->contains(cAt(p10))) { 3199 p10 = moveBack(p10); 3200 } 3201 while (fCloseSet->contains(cAt(p10))) { 3202 p10 = moveBack(p10); 3203 } 3204 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 3205 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 3206 continue; 3207 } 3208 } 3209 3210 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 3211 int p11 = p1; 3212 if (fSepSet->contains(cAt(p11))) { 3213 p11 = moveBack(p11); 3214 } 3215 while (fSpSet->contains(cAt(p11))) { 3216 p11 = moveBack(p11); 3217 } 3218 while (fCloseSet->contains(cAt(p11))) { 3219 p11 = moveBack(p11); 3220 } 3221 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 3222 break; 3223 } 3224 3225 // Rule (12) Any x Any 3226 continue; 3227 } 3228 breakPos = p2; 3229 return breakPos; 3230 } 3231 3232 RBBISentMonkey::~RBBISentMonkey() { 3233 delete fSets; 3234 delete fSepSet; 3235 delete fFormatSet; 3236 delete fSpSet; 3237 delete fLowerSet; 3238 delete fUpperSet; 3239 delete fOLetterSet; 3240 delete fNumericSet; 3241 delete fATermSet; 3242 delete fSContinueSet; 3243 delete fSTermSet; 3244 delete fCloseSet; 3245 delete fOtherSet; 3246 delete fExtendSet; 3247 } 3248 3249 3250 3251 //------------------------------------------------------------------------------------------- 3252 // 3253 // RBBILineMonkey 3254 // 3255 //------------------------------------------------------------------------------------------- 3256 3257 class RBBILineMonkey: public RBBIMonkeyKind { 3258 public: 3259 RBBILineMonkey(); 3260 virtual ~RBBILineMonkey(); 3261 virtual UVector *charClasses(); 3262 virtual void setText(const UnicodeString &s); 3263 virtual int32_t next(int32_t i); 3264 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 3265 private: 3266 UVector *fSets; 3267 3268 UnicodeSet *fBK; 3269 UnicodeSet *fCR; 3270 UnicodeSet *fLF; 3271 UnicodeSet *fCM; 3272 UnicodeSet *fNL; 3273 UnicodeSet *fSG; 3274 UnicodeSet *fWJ; 3275 UnicodeSet *fZW; 3276 UnicodeSet *fGL; 3277 UnicodeSet *fCB; 3278 UnicodeSet *fSP; 3279 UnicodeSet *fB2; 3280 UnicodeSet *fBA; 3281 UnicodeSet *fBB; 3282 UnicodeSet *fHY; 3283 UnicodeSet *fH2; 3284 UnicodeSet *fH3; 3285 UnicodeSet *fCL; 3286 UnicodeSet *fCP; 3287 UnicodeSet *fEX; 3288 UnicodeSet *fIN; 3289 UnicodeSet *fJL; 3290 UnicodeSet *fJV; 3291 UnicodeSet *fJT; 
3292 UnicodeSet *fNS; 3293 UnicodeSet *fOP; 3294 UnicodeSet *fQU; 3295 UnicodeSet *fIS; 3296 UnicodeSet *fNU; 3297 UnicodeSet *fPO; 3298 UnicodeSet *fPR; 3299 UnicodeSet *fSY; 3300 UnicodeSet *fAI; 3301 UnicodeSet *fAL; 3302 UnicodeSet *fID; 3303 UnicodeSet *fSA; 3304 UnicodeSet *fXX; 3305 3306 BreakIterator *fCharBI; 3307 3308 const UnicodeString *fText; 3309 int32_t *fOrigPositions; 3310 3311 RegexMatcher *fNumberMatcher; 3312 RegexMatcher *fLB11Matcher; 3313 }; 3314 3315 3316 RBBILineMonkey::RBBILineMonkey() 3317 { 3318 UErrorCode status = U_ZERO_ERROR; 3319 3320 fSets = new UVector(status); 3321 3322 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3323 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3324 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3325 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3326 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3327 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3328 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3329 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3330 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3331 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3332 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3333 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3334 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3335 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3336 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3337 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3338 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), 
status); 3339 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 3340 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3341 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3342 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3343 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3344 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3345 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3346 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3347 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3348 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3349 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3350 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3351 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3352 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3353 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3354 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3355 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3356 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3357 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3358 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3359 3360 if (U_FAILURE(status)) { 3361 deferredStatus = status; 3362 fCharBI = NULL; 3363 fNumberMatcher = NULL; 3364 return; 3365 } 3366 3367 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3368 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3369 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults 
to AL 3370 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3371 3372 fSets->addElement(fBK, status); 3373 fSets->addElement(fCR, status); 3374 fSets->addElement(fLF, status); 3375 fSets->addElement(fCM, status); 3376 fSets->addElement(fNL, status); 3377 fSets->addElement(fWJ, status); 3378 fSets->addElement(fZW, status); 3379 fSets->addElement(fGL, status); 3380 fSets->addElement(fCB, status); 3381 fSets->addElement(fSP, status); 3382 fSets->addElement(fB2, status); 3383 fSets->addElement(fBA, status); 3384 fSets->addElement(fBB, status); 3385 fSets->addElement(fHY, status); 3386 fSets->addElement(fH2, status); 3387 fSets->addElement(fH3, status); 3388 fSets->addElement(fCL, status); 3389 fSets->addElement(fCP, status); 3390 fSets->addElement(fEX, status); 3391 fSets->addElement(fIN, status); 3392 fSets->addElement(fJL, status); 3393 fSets->addElement(fJT, status); 3394 fSets->addElement(fJV, status); 3395 fSets->addElement(fNS, status); 3396 fSets->addElement(fOP, status); 3397 fSets->addElement(fQU, status); 3398 fSets->addElement(fIS, status); 3399 fSets->addElement(fNU, status); 3400 fSets->addElement(fPO, status); 3401 fSets->addElement(fPR, status); 3402 fSets->addElement(fSY, status); 3403 fSets->addElement(fAI, status); 3404 fSets->addElement(fAL, status); 3405 fSets->addElement(fID, status); 3406 fSets->addElement(fWJ, status); 3407 fSets->addElement(fSA, status); 3408 fSets->addElement(fSG, status); 3409 3410 const char *rules = 3411 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3412 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3413 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3414 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3415 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
3416 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3417 3418 fNumberMatcher = new RegexMatcher( 3419 UnicodeString(rules, -1, US_INV), 0, status); 3420 3421 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3422 3423 if (U_FAILURE(status)) { 3424 deferredStatus = status; 3425 } 3426 } 3427 3428 3429 void RBBILineMonkey::setText(const UnicodeString &s) { 3430 fText = &s; 3431 fCharBI->setText(s); 3432 fNumberMatcher->reset(s); 3433 } 3434 3435 // 3436 // rule9Adjust 3437 // Line Break TR rules 9 and 10 implementation. 3438 // This deals with combining marks and other sequences that 3439 // that must be treated as if they were something other than what they actually are. 3440 // 3441 // This is factored out into a separate function because it must be applied twice for 3442 // each potential break, once to the chars before the position being checked, then 3443 // again to the text following the possible break. 3444 // 3445 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3446 if (pos == -1) { 3447 // Invalid initial position. Happens during the warmup iteration of the 3448 // main loop in next(). 3449 return; 3450 } 3451 3452 int32_t nPos = *nextPos; 3453 3454 // LB 9 Keep combining sequences together. 3455 // advance over any CM class chars. Note that Line Break CM is different 3456 // from the normal Grapheme Extend property. 3457 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3458 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3459 for (;;) { 3460 *nextChar = fText->char32At(nPos); 3461 if (!fCM->contains(*nextChar)) { 3462 break; 3463 } 3464 nPos = fText->moveIndex32(nPos, 1); 3465 } 3466 } 3467 3468 3469 // LB 9 Treat X CM* as if it were x. 3470 // No explicit action required. 
3471 3472 // LB 10 Treat any remaining combining mark as AL 3473 if (fCM->contains(*posChar)) { 3474 *posChar = 0x41; // thisChar = 'A'; 3475 } 3476 3477 // Push the updated nextPos and nextChar back to our caller. 3478 // This only makes a difference if posChar got bigger by consuming a 3479 // combining sequence. 3480 *nextPos = nPos; 3481 *nextChar = fText->char32At(nPos); 3482 } 3483 3484 3485 3486 int32_t RBBILineMonkey::next(int32_t startPos) { 3487 UErrorCode status = U_ZERO_ERROR; 3488 int32_t pos; // Index of the char following a potential break position 3489 UChar32 thisChar; // Character at above position "pos" 3490 3491 int32_t prevPos; // Index of the char preceding a potential break position 3492 UChar32 prevChar; // Character at above position. Note that prevChar 3493 // and thisChar may not be adjacent because combining 3494 // characters between them will be ignored. 3495 3496 int32_t nextPos; // Index of the next character following pos. 3497 // Usually skips over combining marks. 3498 int32_t nextCPPos; // Index of the code point following "pos." 3499 // May point to a combining mark. 3500 int32_t tPos; // temp value. 3501 UChar32 c; 3502 3503 if (U_FAILURE(deferredStatus)) { 3504 return -1; 3505 } 3506 3507 if (startPos >= fText->length()) { 3508 return -1; 3509 } 3510 3511 3512 // Initial values for loop. Loop will run the first time without finding breaks, 3513 // while the invalid values shift out and the "this" and 3514 // "prev" positions are filled in with good values. 3515 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 3516 thisChar = prevChar = 0; 3517 nextPos = nextCPPos = startPos; 3518 3519 3520 // Loop runs once per position in the test text, until a break position 3521 // is found. 
3522 for (;;) { 3523 prevPos = pos; 3524 prevChar = thisChar; 3525 3526 pos = nextPos; 3527 thisChar = fText->char32At(pos); 3528 3529 nextCPPos = fText->moveIndex32(pos, 1); 3530 nextPos = nextCPPos; 3531 3532 // Rule LB2 - Break at end of text. 3533 if (pos >= fText->length()) { 3534 break; 3535 } 3536 3537 // Rule LB 9 - adjust for combining sequences. 3538 // We do this one out-of-order because the adjustment does not change anything 3539 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3540 // be applied. 3541 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3542 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3543 c = fText->char32At(nextPos); 3544 rule9Adjust(pos, &thisChar, &nextPos, &c); 3545 3546 // If the loop is still warming up - if we haven't shifted the initial 3547 // -1 positions out of prevPos yet - loop back to advance the 3548 // position in the input without any further looking for breaks. 3549 if (prevPos == -1) { 3550 continue; 3551 } 3552 3553 // LB 4 Always break after hard line breaks, 3554 if (fBK->contains(prevChar)) { 3555 break; 3556 } 3557 3558 // LB 5 Break after CR, LF, NL, but not inside CR LF 3559 if (prevChar == 0x0d && thisChar == 0x0a) { 3560 continue; 3561 } 3562 if (prevChar == 0x0d || 3563 prevChar == 0x0a || 3564 prevChar == 0x85) { 3565 break; 3566 } 3567 3568 // LB 6 Don't break before hard line breaks 3569 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3570 fBK->contains(thisChar)) { 3571 continue; 3572 } 3573 3574 3575 // LB 7 Don't break before spaces or zero-width space. 3576 if (fSP->contains(thisChar)) { 3577 continue; 3578 } 3579 3580 if (fZW->contains(thisChar)) { 3581 continue; 3582 } 3583 3584 // LB 8 Break after zero width space 3585 if (fZW->contains(prevChar)) { 3586 break; 3587 } 3588 3589 // LB 9, 10 Already done, at top of loop. 3590 // 3591 3592 3593 // LB 11 Do not break before or after WORD JOINER and related characters. 
3594 // x WJ 3595 // WJ x 3596 // 3597 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3598 continue; 3599 } 3600 3601 // LB 12 3602 // GL x 3603 if (fGL->contains(prevChar)) { 3604 continue; 3605 } 3606 3607 // LB 12a 3608 // [^SP BA HY] x GL 3609 if (!(fSP->contains(prevChar) || 3610 fBA->contains(prevChar) || 3611 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3612 continue; 3613 } 3614 3615 3616 3617 // LB 13 Don't break before closings. 3618 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3619 // fall into LB 17 and the more general number regular expression. 3620 // 3621 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3622 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3623 fEX->contains(thisChar) || 3624 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3625 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3626 continue; 3627 } 3628 3629 // LB 14 Don't break after OP SP* 3630 // Scan backwards, checking for this sequence. 3631 // The OP char could include combining marks, so we actually check for 3632 // OP CM* SP* 3633 // Another Twist: The Rule 67 fixes may have changed a SP CM 3634 // sequence into a ID char, so before scanning back through spaces, 3635 // verify that prevChar is indeed a space. 
The prevChar variable 3636 // may differ from fText[prevPos] 3637 tPos = prevPos; 3638 if (fSP->contains(prevChar)) { 3639 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3640 tPos=fText->moveIndex32(tPos, -1); 3641 } 3642 } 3643 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3644 tPos=fText->moveIndex32(tPos, -1); 3645 } 3646 if (fOP->contains(fText->char32At(tPos))) { 3647 continue; 3648 } 3649 3650 3651 // LB 15 QU SP* x OP 3652 if (fOP->contains(thisChar)) { 3653 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3654 int tPos = prevPos; 3655 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3656 tPos = fText->moveIndex32(tPos, -1); 3657 } 3658 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3659 tPos = fText->moveIndex32(tPos, -1); 3660 } 3661 if (fQU->contains(fText->char32At(tPos))) { 3662 continue; 3663 } 3664 } 3665 3666 3667 3668 // LB 16 (CL | CP) SP* x NS 3669 // Scan backwards for SP* CM* (CL | CP) 3670 if (fNS->contains(thisChar)) { 3671 int tPos = prevPos; 3672 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3673 tPos = fText->moveIndex32(tPos, -1); 3674 } 3675 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3676 tPos = fText->moveIndex32(tPos, -1); 3677 } 3678 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3679 continue; 3680 } 3681 } 3682 3683 3684 // LB 17 B2 SP* x B2 3685 if (fB2->contains(thisChar)) { 3686 // Scan backwards, checking for the B2 CM* SP* sequence. 
3687 tPos = prevPos; 3688 if (fSP->contains(prevChar)) { 3689 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3690 tPos=fText->moveIndex32(tPos, -1); 3691 } 3692 } 3693 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3694 tPos=fText->moveIndex32(tPos, -1); 3695 } 3696 if (fB2->contains(fText->char32At(tPos))) { 3697 continue; 3698 } 3699 } 3700 3701 3702 // LB 18 break after space 3703 if (fSP->contains(prevChar)) { 3704 break; 3705 } 3706 3707 // LB 19 3708 // x QU 3709 // QU x 3710 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3711 continue; 3712 } 3713 3714 // LB 20 Break around a CB 3715 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3716 break; 3717 } 3718 3719 // LB 21 3720 if (fBA->contains(thisChar) || 3721 fHY->contains(thisChar) || 3722 fNS->contains(thisChar) || 3723 fBB->contains(prevChar) ) { 3724 continue; 3725 } 3726 3727 // LB 22 3728 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3729 (fID->contains(prevChar) && fIN->contains(thisChar)) || 3730 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3731 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3732 continue; 3733 } 3734 3735 3736 // LB 23 ID x PO 3737 // AL x NU 3738 // NU x AL 3739 if ((fID->contains(prevChar) && fPO->contains(thisChar)) || 3740 (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3741 (fNU->contains(prevChar) && fAL->contains(thisChar)) ) { 3742 continue; 3743 } 3744 3745 // LB 24 Do not break between prefix and letters or ideographs. 3746 // PR x ID 3747 // PR x AL 3748 // PO x AL 3749 if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3750 (fPR->contains(prevChar) && fAL->contains(thisChar)) || 3751 (fPO->contains(prevChar) && fAL->contains(thisChar)) ) { 3752 continue; 3753 } 3754 3755 3756 3757 // LB 25 Numbers 3758 if (fNumberMatcher->lookingAt(prevPos, status)) { 3759 if (U_FAILURE(status)) { 3760 break; 3761 } 3762 // Matched a number. 
But could have been just a single digit, which would 3763 // not represent a "no break here" between prevChar and thisChar 3764 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3765 if (numEndIdx > pos) { 3766 // Number match includes at least our two chars being checked 3767 if (numEndIdx > nextPos) { 3768 // Number match includes additional chars. Update pos and nextPos 3769 // so that next loop iteration will continue at the end of the number, 3770 // checking for breaks between last char in number & whatever follows. 3771 pos = nextPos = numEndIdx; 3772 do { 3773 pos = fText->moveIndex32(pos, -1); 3774 thisChar = fText->char32At(pos); 3775 } while (fCM->contains(thisChar)); 3776 } 3777 continue; 3778 } 3779 } 3780 3781 3782 // LB 26 Do not break a Korean syllable. 3783 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3784 fJV->contains(thisChar) || 3785 fH2->contains(thisChar) || 3786 fH3->contains(thisChar))) { 3787 continue; 3788 } 3789 3790 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3791 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3792 continue; 3793 } 3794 3795 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3796 fJT->contains(thisChar)) { 3797 continue; 3798 } 3799 3800 // LB 27 Treat a Korean Syllable Block the same as ID. 
3801 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3802 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3803 fIN->contains(thisChar)) { 3804 continue; 3805 } 3806 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3807 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3808 fPO->contains(thisChar)) { 3809 continue; 3810 } 3811 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3812 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3813 continue; 3814 } 3815 3816 3817 3818 // LB 28 Do not break between alphabetics ("at"). 3819 if (fAL->contains(prevChar) && fAL->contains(thisChar)) { 3820 continue; 3821 } 3822 3823 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3824 if (fIS->contains(prevChar) && fAL->contains(thisChar)) { 3825 continue; 3826 } 3827 3828 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
3829 // (AL | NU) x OP 3830 // CP x (AL | NU) 3831 if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3832 continue; 3833 } 3834 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) { 3835 continue; 3836 } 3837 3838 // LB 31 Break everywhere else 3839 break; 3840 3841 } 3842 3843 return pos; 3844 } 3845 3846 3847 UVector *RBBILineMonkey::charClasses() { 3848 return fSets; 3849 } 3850 3851 3852 RBBILineMonkey::~RBBILineMonkey() { 3853 delete fSets; 3854 3855 delete fBK; 3856 delete fCR; 3857 delete fLF; 3858 delete fCM; 3859 delete fNL; 3860 delete fWJ; 3861 delete fZW; 3862 delete fGL; 3863 delete fCB; 3864 delete fSP; 3865 delete fB2; 3866 delete fBA; 3867 delete fBB; 3868 delete fHY; 3869 delete fH2; 3870 delete fH3; 3871 delete fCL; 3872 delete fCP; 3873 delete fEX; 3874 delete fIN; 3875 delete fJL; 3876 delete fJV; 3877 delete fJT; 3878 delete fNS; 3879 delete fOP; 3880 delete fQU; 3881 delete fIS; 3882 delete fNU; 3883 delete fPO; 3884 delete fPR; 3885 delete fSY; 3886 delete fAI; 3887 delete fAL; 3888 delete fID; 3889 delete fSA; 3890 delete fSG; 3891 delete fXX; 3892 3893 delete fCharBI; 3894 delete fNumberMatcher; 3895 } 3896 3897 3898 //------------------------------------------------------------------------------------------- 3899 // 3900 // TestMonkey 3901 // 3902 // params 3903 // seed=nnnnn Random number starting seed. 3904 // Setting the seed allows errors to be reproduced. 3905 // loop=nnn Looping count. Controls running time. 3906 // -1: run forever. 3907 // 0 or greater: run length. 
3908 // 3909 // type = char | word | line | sent | title 3910 // 3911 //------------------------------------------------------------------------------------------- 3912 3913 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3914 int32_t val = defaultVal; 3915 name.append(" *= *(-?\\d+)"); 3916 UErrorCode status = U_ZERO_ERROR; 3917 RegexMatcher m(name, params, 0, status); 3918 if (m.find()) { 3919 // The param exists. Convert the string to an int. 3920 char valString[100]; 3921 int32_t paramLength = m.end(1, status) - m.start(1, status); 3922 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3923 paramLength = (int32_t)(sizeof(valString)-2); 3924 } 3925 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3926 val = strtol(valString, NULL, 10); 3927 3928 // Delete this parameter from the params string. 3929 m.reset(); 3930 params = m.replaceFirst("", status); 3931 } 3932 U_ASSERT(U_SUCCESS(status)); 3933 return val; 3934 } 3935 #endif 3936 3937 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3938 BreakIterator *bi, 3939 int expected[], 3940 int expectedcount) 3941 { 3942 int count = 0; 3943 int i = 0; 3944 int forward[50]; 3945 bi->setText(ustr); 3946 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3947 forward[count] = i; 3948 if (count < expectedcount && expected[count] != i) { 3949 test->errln("break forward test failed: expected %d but got %d", 3950 expected[count], i); 3951 break; 3952 } 3953 count ++; 3954 } 3955 if (count != expectedcount) { 3956 printStringBreaks(ustr, expected, expectedcount); 3957 test->errln("break forward test failed: missed %d match", 3958 expectedcount - count); 3959 return; 3960 } 3961 // testing boundaries 3962 for (i = 1; i < expectedcount; i ++) { 3963 int j = expected[i - 1]; 3964 if (!bi->isBoundary(j)) { 3965 printStringBreaks(ustr, expected, expectedcount); 3966 test->errln("isBoundary() failed. 
Expected boundary at position %d", j); 3967 return; 3968 } 3969 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3970 if (bi->isBoundary(j)) { 3971 printStringBreaks(ustr, expected, expectedcount); 3972 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3973 return; 3974 } 3975 } 3976 } 3977 3978 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3979 count --; 3980 if (forward[count] != i) { 3981 test->errln("happy break test previous() failed: expected %d but got %d", 3982 forward[count], i); 3983 break; 3984 } 3985 } 3986 if (count != 0) { 3987 printStringBreaks(ustr, expected, expectedcount); 3988 test->errln("break test previous() failed: missed a match"); 3989 return; 3990 } 3991 3992 // testing preceding 3993 for (i = 0; i < expectedcount - 1; i ++) { 3994 // int j = expected[i] + 1; 3995 int j = ustr.moveIndex32(expected[i], 1); 3996 for (; j <= expected[i + 1]; j ++) { 3997 if (bi->preceding(j) != expected[i]) { 3998 printStringBreaks(ustr, expected, expectedcount); 3999 test->errln("preceding(): Not expecting boundary at position %d", j); 4000 return; 4001 } 4002 } 4003 } 4004 } 4005 4006 void RBBITest::TestWordBreaks(void) 4007 { 4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4009 4010 Locale locale("en"); 4011 UErrorCode status = U_ZERO_ERROR; 4012 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4013 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4014 static const char *strlist[] = 4015 { 4016 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 4017 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 4018 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 4019 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 4020 "\\u90ca\\u3588\\u009c\\u0953\\u194b", 4021 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4022 
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 4023 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 4024 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4025 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4026 "\\u2027\\U000e0067\\u0a47\\u00b7", 4027 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4028 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4029 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4030 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4031 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4032 "\\u0027\\u11af\\U000e0057\\u0602", 4033 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4034 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4035 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4036 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4037 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4038 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4039 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4040 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4041 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4042 "\\u58f4\\U000e0049\\u20e7\\u2027", 4043 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4044 "\\ua183\\u102d\\u0bec\\u003a", 4045 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4046 "\\u003a\\u0e57\\u0fad\\u002e", 4047 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4048 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4049 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 4050 "\\u003a\\u0664\\u00b7\\u1fba", 4051 "\\u003b\\u0027\\u00b7\\u47a3", 4052 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 4053 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 4054 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 4055 }; 4056 int loop; 4057 if (U_FAILURE(status)) { 4058 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4059 return; 4060 } 4061 for (loop = 0; 
loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4062 // printf("looping %d\n", loop); 4063 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 4064 // RBBICharMonkey monkey; 4065 RBBIWordMonkey monkey; 4066 4067 int expected[50]; 4068 int expectedcount = 0; 4069 4070 monkey.setText(ustr); 4071 int i; 4072 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4073 expected[expectedcount ++] = i; 4074 } 4075 4076 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4077 } 4078 delete bi; 4079 #endif 4080 } 4081 4082 void RBBITest::TestWordBoundary(void) 4083 { 4084 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 4085 Locale locale("en"); 4086 UErrorCode status = U_ZERO_ERROR; 4087 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4088 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4089 UChar str[50]; 4090 static const char *strlist[] = 4091 { 4092 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4093 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4094 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4095 "\\u2027\\U000e0067\\u0a47\\u00b7", 4096 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4097 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4098 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4099 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4100 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4101 "\\u0027\\u11af\\U000e0057\\u0602", 4102 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4103 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4104 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4105 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4106 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4107 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4108 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4109 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4110 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4111 "\\u58f4\\U000e0049\\u20e7\\u2027", 
4112 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4113 "\\ua183\\u102d\\u0bec\\u003a", 4114 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4115 "\\u003a\\u0e57\\u0fad\\u002e", 4116 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4117 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4118 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4119 "\\u003a\\u0664\\u00b7\\u1fba", 4120 "\\u003b\\u0027\\u00b7\\u47a3", 4121 }; 4122 int loop; 4123 if (U_FAILURE(status)) { 4124 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4125 return; 4126 } 4127 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4128 // printf("looping %d\n", loop); 4129 u_unescape(strlist[loop], str, 20); 4130 UnicodeString ustr(str); 4131 int forward[50]; 4132 int count = 0; 4133 4134 bi->setText(ustr); 4135 int prev = 0; 4136 int i; 4137 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4138 forward[count ++] = i; 4139 if (i > prev) { 4140 int j; 4141 for (j = prev + 1; j < i; j ++) { 4142 if (bi->isBoundary(j)) { 4143 printStringBreaks(ustr, forward, count); 4144 errln("happy boundary test failed: expected %d not a boundary", 4145 j); 4146 return; 4147 } 4148 } 4149 } 4150 if (!bi->isBoundary(i)) { 4151 printStringBreaks(ustr, forward, count); 4152 errln("happy boundary test failed: expected %d a boundary", 4153 i); 4154 return; 4155 } 4156 prev = i; 4157 } 4158 } 4159 delete bi; 4160 } 4161 4162 void RBBITest::TestLineBreaks(void) 4163 { 4164 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4165 Locale locale("en"); 4166 UErrorCode status = U_ZERO_ERROR; 4167 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4168 const int32_t STRSIZE = 50; 4169 UChar str[STRSIZE]; 4170 static const char *strlist[] = 4171 { 4172 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4173 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 
4174 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4175 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4176 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4177 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 4178 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4179 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4180 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4181 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4182 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4183 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4184 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4185 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4186 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4187 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4188 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4189 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4190 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4191 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4192 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4193 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4194 
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4195 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4196 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4197 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4198 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4199 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4200 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4201 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4202 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4203 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4204 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4205 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4206 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4207 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4208 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4209 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4210 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4211 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4212 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4213 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4214 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4215 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4216 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4217 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4218 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4219 }; 4220 int loop; 4221 TEST_ASSERT_SUCCESS(status); 4222 if (U_FAILURE(status)) { 4223 return; 4224 } 4225 
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4226 // printf("looping %d\n", loop); 4227 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4228 if (t >= STRSIZE) { 4229 TEST_ASSERT(FALSE); 4230 continue; 4231 } 4232 4233 4234 UnicodeString ustr(str); 4235 RBBILineMonkey monkey; 4236 if (U_FAILURE(monkey.deferredStatus)) { 4237 continue; 4238 } 4239 4240 const int EXPECTEDSIZE = 50; 4241 int expected[EXPECTEDSIZE]; 4242 int expectedcount = 0; 4243 4244 monkey.setText(ustr); 4245 int i; 4246 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4247 if (expectedcount >= EXPECTEDSIZE) { 4248 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4249 return; 4250 } 4251 expected[expectedcount ++] = i; 4252 } 4253 4254 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4255 } 4256 delete bi; 4257 #endif 4258 } 4259 4260 void RBBITest::TestSentBreaks(void) 4261 { 4262 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4263 Locale locale("en"); 4264 UErrorCode status = U_ZERO_ERROR; 4265 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4266 UChar str[200]; 4267 static const char *strlist[] = 4268 { 4269 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4270 "This\n", 4271 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4272 "\"Sentence ending with a quote.\" Bye.", 4273 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4274 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4275 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4276 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4277 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4278 "Don't rock the boat.\\u2029Because I am the daddy, that is why. 
Not on my time (el timo.)!", 4279 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4280 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4281 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4282 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4283 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4284 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4285 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4286 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4287 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4288 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4289 }; 4290 int loop; 4291 if (U_FAILURE(status)) { 4292 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4293 return; 4294 } 4295 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4296 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4297 UnicodeString ustr(str); 4298 4299 RBBISentMonkey monkey; 4300 if (U_FAILURE(monkey.deferredStatus)) { 4301 continue; 4302 } 4303 4304 const int EXPECTEDSIZE = 50; 4305 int expected[EXPECTEDSIZE]; 4306 int expectedcount = 0; 4307 4308 monkey.setText(ustr); 4309 int i; 4310 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4311 if (expectedcount >= EXPECTEDSIZE) { 4312 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4313 return; 4314 } 4315 expected[expectedcount ++] = i; 4316 } 4317 4318 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4319 } 4320 delete bi; 4321 #endif 4322 } 4323 4324 void RBBITest::TestMonkey(char *params) { 4325 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4326 4327 UErrorCode status = U_ZERO_ERROR; 4328 int32_t loopCount = 500; 4329 int32_t seed = 1; 4330 UnicodeString breakType = "all"; 4331 Locale locale("en"); 4332 UBool 
useUText = FALSE; 4333 4334 if (quick == FALSE) { 4335 loopCount = 10000; 4336 } 4337 4338 if (params) { 4339 UnicodeString p(params); 4340 loopCount = getIntParam("loop", p, loopCount); 4341 seed = getIntParam("seed", p, seed); 4342 4343 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4344 if (m.find()) { 4345 breakType = m.group(1, status); 4346 m.reset(); 4347 p = m.replaceFirst("", status); 4348 } 4349 4350 RegexMatcher u(" *utext", p, 0, status); 4351 if (u.find()) { 4352 useUText = TRUE; 4353 u.reset(); 4354 p = u.replaceFirst("", status); 4355 } 4356 4357 4358 // m.reset(p); 4359 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4360 // Each option is stripped out of the option string as it is processed. 4361 // All options have been checked. The option string should have been completely emptied.. 4362 char buf[100]; 4363 p.extract(buf, sizeof(buf), NULL, status); 4364 buf[sizeof(buf)-1] = 0; 4365 errln("Unrecognized or extra parameter: %s\n", buf); 4366 return; 4367 } 4368 4369 } 4370 4371 if (breakType == "char" || breakType == "all") { 4372 RBBICharMonkey m; 4373 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4374 if (U_SUCCESS(status)) { 4375 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4376 if (breakType == "all" && useUText==FALSE) { 4377 // Also run a quick test with UText when "all" is specified 4378 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4379 } 4380 } 4381 else { 4382 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4383 } 4384 delete bi; 4385 } 4386 4387 if (breakType == "word" || breakType == "all") { 4388 logln("Word Break Monkey Test"); 4389 RBBIWordMonkey m; 4390 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4391 if (U_SUCCESS(status)) { 4392 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4393 } 4394 else { 4395 errcheckln(status, "Creation of word break iterator failed %s", 
u_errorName(status)); 4396 } 4397 delete bi; 4398 } 4399 4400 if (breakType == "line" || breakType == "all") { 4401 logln("Line Break Monkey Test"); 4402 RBBILineMonkey m; 4403 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4404 if (loopCount >= 10) { 4405 loopCount = loopCount / 5; // Line break runs slower than the others. 4406 } 4407 if (U_SUCCESS(status)) { 4408 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4409 } 4410 else { 4411 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4412 } 4413 delete bi; 4414 } 4415 4416 if (breakType == "sent" || breakType == "all" ) { 4417 logln("Sentence Break Monkey Test"); 4418 RBBISentMonkey m; 4419 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4420 if (loopCount >= 10) { 4421 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4422 } 4423 if (U_SUCCESS(status)) { 4424 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4425 } 4426 else { 4427 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4428 } 4429 delete bi; 4430 } 4431 4432 #endif 4433 } 4434 4435 // 4436 // Run a RBBI monkey test. Common routine, for all break iterator types. 4437 // Parameters: 4438 // bi - the break iterator to use 4439 // mk - MonkeyKind, abstraction for obtaining expected results 4440 // name - Name of test (char, word, etc.) 
for use in error messages 4441 // seed - Seed for starting random number generator (parameter from user) 4442 // numIterations 4443 // 4444 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4445 int32_t numIterations, UBool useUText) { 4446 4447 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4448 4449 const int32_t TESTSTRINGLEN = 500; 4450 UnicodeString testText; 4451 int32_t numCharClasses; 4452 UVector *chClasses; 4453 int expected[TESTSTRINGLEN*2 + 1]; 4454 int expectedCount = 0; 4455 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4456 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4457 char reverseBreaks[TESTSTRINGLEN*2+1]; 4458 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4459 char followingBreaks[TESTSTRINGLEN*2+1]; 4460 char precedingBreaks[TESTSTRINGLEN*2+1]; 4461 int i; 4462 int loopCount = 0; 4463 4464 m_seed = seed; 4465 4466 numCharClasses = mk.charClasses()->size(); 4467 chClasses = mk.charClasses(); 4468 4469 // Check for errors that occured during the construction of the MonkeyKind object. 4470 // Can't report them where they occured because errln() is a method coming from intlTest, 4471 // and is not visible outside of RBBITest :-( 4472 if (U_FAILURE(mk.deferredStatus)) { 4473 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4474 return; 4475 } 4476 4477 // Verify that the character classes all have at least one member. 4478 for (i=0; i<numCharClasses; i++) { 4479 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4480 if (s == NULL || s->size() == 0) { 4481 errln("Character Class #%d is null or of zero size.", i); 4482 return; 4483 } 4484 } 4485 4486 while (loopCount < numIterations || numIterations == -1) { 4487 if (numIterations == -1 && loopCount % 10 == 0) { 4488 // If test is running in an infinite loop, display a periodic tic so 4489 // we can tell that it is making progress. 
4490 fprintf(stderr, "."); 4491 } 4492 // Save current random number seed, so that we can recreate the random numbers 4493 // for this loop iteration in event of an error. 4494 seed = m_seed; 4495 4496 // Populate a test string with data. 4497 testText.truncate(0); 4498 for (i=0; i<TESTSTRINGLEN; i++) { 4499 int32_t aClassNum = m_rand() % numCharClasses; 4500 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4501 int32_t charIdx = m_rand() % classSet->size(); 4502 UChar32 c = classSet->charAt(charIdx); 4503 if (c < 0) { // TODO: deal with sets containing strings. 4504 errln("c < 0"); 4505 break; 4506 } 4507 testText.append(c); 4508 } 4509 4510 // Calculate the expected results for this test string. 4511 mk.setText(testText); 4512 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4513 expectedBreaks[0] = 1; 4514 int32_t breakPos = 0; 4515 expectedCount = 0; 4516 for (;;) { 4517 breakPos = mk.next(breakPos); 4518 if (breakPos == -1) { 4519 break; 4520 } 4521 if (breakPos > testText.length()) { 4522 errln("breakPos > testText.length()"); 4523 } 4524 expectedBreaks[breakPos] = 1; 4525 U_ASSERT(expectedCount<testText.length()); 4526 expected[expectedCount ++] = breakPos; 4527 } 4528 4529 // Find the break positions using forward iteration 4530 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4531 if (useUText) { 4532 UErrorCode status = U_ZERO_ERROR; 4533 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4534 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4535 bi->setText(testUText, status); 4536 TEST_ASSERT_SUCCESS(status); 4537 utext_close(testUText); // The break iterator does a shallow clone of the UText 4538 // This UText can be closed immediately, so long as the 4539 // testText string continues to exist. 
4540 } else { 4541 bi->setText(testText); 4542 } 4543 4544 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4545 if (i < 0 || i > testText.length()) { 4546 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4547 break; 4548 } 4549 forwardBreaks[i] = 1; 4550 } 4551 4552 // Find the break positions using reverse iteration 4553 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4554 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4555 if (i < 0 || i > testText.length()) { 4556 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4557 break; 4558 } 4559 reverseBreaks[i] = 1; 4560 } 4561 4562 // Find the break positions using isBoundary() tests. 4563 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4564 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4565 for (i=0; i<=testText.length(); i++) { 4566 isBoundaryBreaks[i] = bi->isBoundary(i); 4567 } 4568 4569 4570 // Find the break positions using the following() function. 4571 // printf("."); 4572 memset(followingBreaks, 0, sizeof(followingBreaks)); 4573 int32_t lastBreakPos = 0; 4574 followingBreaks[0] = 1; 4575 for (i=0; i<testText.length(); i++) { 4576 breakPos = bi->following(i); 4577 if (breakPos <= i || 4578 breakPos < lastBreakPos || 4579 breakPos > testText.length() || 4580 (breakPos > lastBreakPos && lastBreakPos > i)) { 4581 errln("%s break monkey test: " 4582 "Out of range value returned by BreakIterator::following().\n" 4583 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4584 name, seed, i, breakPos, lastBreakPos); 4585 break; 4586 } 4587 followingBreaks[breakPos] = 1; 4588 lastBreakPos = breakPos; 4589 } 4590 4591 // Find the break positions using the preceding() function. 
4592 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4593 lastBreakPos = testText.length(); 4594 precedingBreaks[testText.length()] = 1; 4595 for (i=testText.length(); i>0; i--) { 4596 breakPos = bi->preceding(i); 4597 if (breakPos >= i || 4598 breakPos > lastBreakPos || 4599 (breakPos < 0 && testText.getChar32Start(i)>0) || 4600 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4601 errln("%s break monkey test: " 4602 "Out of range value returned by BreakIterator::preceding().\n" 4603 "index=%d; prev returned %d; lastBreak=%d" , 4604 name, i, breakPos, lastBreakPos); 4605 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4606 precedingBreaks[i] = 2; // Forces an error. 4607 } 4608 } else { 4609 if (breakPos >= 0) { 4610 precedingBreaks[breakPos] = 1; 4611 } 4612 lastBreakPos = breakPos; 4613 } 4614 } 4615 4616 // Compare the expected and actual results. 4617 for (i=0; i<=testText.length(); i++) { 4618 const char *errorType = NULL; 4619 if (forwardBreaks[i] != expectedBreaks[i]) { 4620 errorType = "next()"; 4621 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4622 errorType = "previous()"; 4623 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4624 errorType = "isBoundary()"; 4625 } else if (followingBreaks[i] != expectedBreaks[i]) { 4626 errorType = "following()"; 4627 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4628 errorType = "preceding()"; 4629 } 4630 4631 4632 if (errorType != NULL) { 4633 // Format a range of the test text that includes the failure as 4634 // a data item that can be included in the rbbi test data file. 4635 4636 // Start of the range is the last point where expected and actual results 4637 // both agreed that there was a break position. 
4638 int startContext = i; 4639 int32_t count = 0; 4640 for (;;) { 4641 if (startContext==0) { break; } 4642 startContext --; 4643 if (expectedBreaks[startContext] != 0) { 4644 if (count == 2) break; 4645 count ++; 4646 } 4647 } 4648 4649 // End of range is two expected breaks past the start position. 4650 int endContext = i + 1; 4651 int ci; 4652 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4653 for (;;) { 4654 if (endContext >= testText.length()) {break;} 4655 if (expectedBreaks[endContext-1] != 0) { 4656 if (count == 0) break; 4657 count --; 4658 } 4659 endContext ++; 4660 } 4661 } 4662 4663 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4664 UnicodeString errorText = "<data>"; 4665 /***if (strcmp(errorType, "next()") == 0) { 4666 startContext = 0; 4667 endContext = testText.length(); 4668 4669 printStringBreaks(testText, expected, expectedCount); 4670 }***/ 4671 4672 for (ci=startContext; ci<endContext;) { 4673 UnicodeString hexChars("0123456789abcdef"); 4674 UChar32 c; 4675 int bn; 4676 c = testText.char32At(ci); 4677 if (ci == i) { 4678 // This is the location of the error. 4679 errorText.append("<?>"); 4680 } else if (expectedBreaks[ci] != 0) { 4681 // This a non-error expected break position. 4682 errorText.append("\\"); 4683 } 4684 if (c < 0x10000) { 4685 errorText.append("\\u"); 4686 for (bn=12; bn>=0; bn-=4) { 4687 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4688 } 4689 } else { 4690 errorText.append("\\U"); 4691 for (bn=28; bn>=0; bn-=4) { 4692 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4693 } 4694 } 4695 ci = testText.moveIndex32(ci, 1); 4696 } 4697 errorText.append("\\"); 4698 errorText.append("</data>\n"); 4699 4700 // Output the error 4701 char charErrorTxt[500]; 4702 UErrorCode status = U_ZERO_ERROR; 4703 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4704 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4705 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4706 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4707 errorType, seed, i, charErrorTxt); 4708 break; 4709 } 4710 } 4711 4712 loopCount++; 4713 } 4714 #endif 4715 } 4716 4717 4718 // Bug 5532. UTF-8 based UText fails in dictionary code. 4719 // This test checks the initial patch, 4720 // which is to just keep it from crashing. Correct word boundaries 4721 // await a proper fix to the dictionary code. 4722 // 4723 void RBBITest::TestBug5532(void) { 4724 // Text includes a mixture of Thai and Latin. 4725 const unsigned char utf8Data[] = { 4726 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4727 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4728 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4729 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4730 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4731 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4732 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4733 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4734 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4735 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4736 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4737 4738 UErrorCode status = U_ZERO_ERROR; 4739 UText utext=UTEXT_INITIALIZER; 4740 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4741 TEST_ASSERT_SUCCESS(status); 4742 4743 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4744 TEST_ASSERT_SUCCESS(status); 4745 if (U_SUCCESS(status)) { 4746 bi->setText(&utext, status); 4747 TEST_ASSERT_SUCCESS(status); 4748 4749 int32_t breakCount = 0; 4750 int32_t previousBreak = -1; 4751 for (bi->first(); bi->next() != BreakIterator::DONE; 
//
//  TestDebug    -  A place-holder test for debugging purposes.
//                  For putting in fragments of other tests that can be invoked
//                  for tracing without a lot of unwanted extra stuff happening.
//
//                  The whole body is inside "#if 0", so as written this test
//                  does nothing when run.
//
void RBBITest::TestDebug(void) {
#if 0
    UErrorCode   status = U_ZERO_ERROR;
    int pos = 0;
    int ruleStatus = 0;

    // Pick the iterator kind to debug by moving the comment markers below.
    RuleBasedBreakIterator* bi =
       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    // UnicodeString s("Aaa. Bcd");
    s = s.unescape();
    bi->setText(s);
    UBool r = bi->isBoundary(8);
    printf("%s", r?"true":"false");
    return;
    // Everything below the return above is unreachable even when the
    // "#if 0" is enabled: a backwards dump of all boundary positions.
    pos = bi->last();
    do {
        // ruleStatus = bi->getRuleStatus();
        printf("%d\t%d\n", pos, ruleStatus);
        pos = bi->previous();
    } while (pos != BreakIterator::DONE);
#endif
}

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */