/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1999-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/************************************************************************
*   Date        Name        Description
*   12/15/99    Madhu       Creation.
*   01/12/2000  Madhu       Updated for changed API and added new tests
************************************************************************/

#include "unicode/utypeinfo.h"  // for 'typeid' to work

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#include "unicode/regex.h"  // TODO: make conditional on regexp being built.
27 #include "unicode/ustring.h" 28 #include "unicode/utext.h" 29 #include "intltest.h" 30 #include "rbbitst.h" 31 #include <string.h> 32 #include "uvector.h" 33 #include "uvectr32.h" 34 #include "triedict.h" 35 #include <string.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include "unicode/numfmt.h" 39 #include "unicode/uscript.h" 40 41 #define TEST_ASSERT(x) {if (!(x)) { \ 42 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 43 44 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 45 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 46 47 48 //--------------------------------------------- 49 // runIndexedTest 50 //--------------------------------------------- 51 52 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 53 { 54 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 55 56 switch (index) { 57 #if !UCONFIG_NO_FILE_IO 58 case 0: name = "TestBug4153072"; 59 if(exec) TestBug4153072(); break; 60 #else 61 case 0: name = "skip"; 62 break; 63 #endif 64 65 case 1: name = "TestJapaneseLineBreak"; 66 if(exec) TestJapaneseLineBreak(); break; 67 case 2: name = "TestStatusReturn"; 68 if(exec) TestStatusReturn(); break; 69 70 #if !UCONFIG_NO_FILE_IO 71 case 3: name = "TestUnicodeFiles"; 72 if(exec) TestUnicodeFiles(); break; 73 case 4: name = "TestEmptyString"; 74 if(exec) TestEmptyString(); break; 75 #else 76 case 3: case 4: name = "skip"; 77 break; 78 #endif 79 80 case 5: name = "TestGetAvailableLocales"; 81 if(exec) TestGetAvailableLocales(); break; 82 83 case 6: name = "TestGetDisplayName"; 84 if(exec) TestGetDisplayName(); break; 85 86 #if !UCONFIG_NO_FILE_IO 87 case 7: name = "TestEndBehaviour"; 88 if(exec) TestEndBehaviour(); break; 89 case 8: name = "TestMixedThaiLineBreak"; 90 if(exec) TestMixedThaiLineBreak(); break; 91 case 9: name = "TestThaiLineBreak"; 92 if(exec) TestThaiLineBreak(); break; 93 case 10: name = 
"TestMaiyamok"; 94 if(exec) TestMaiyamok(); break; 95 case 11: name = "TestWordBreaks"; 96 if(exec) TestWordBreaks(); break; 97 case 12: name = "TestWordBoundary"; 98 if(exec) TestWordBoundary(); break; 99 case 13: name = "TestLineBreaks"; 100 if(exec) TestLineBreaks(); break; 101 case 14: name = "TestSentBreaks"; 102 if(exec) TestSentBreaks(); break; 103 case 15: name = "TestExtended"; 104 if(exec) TestExtended(); break; 105 #else 106 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 107 break; 108 #endif 109 110 case 16: 111 if(exec) { 112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 113 name = "TestMonkey"; 114 TestMonkey(params); 115 #else 116 name = "skip"; 117 #endif 118 } 119 break; 120 121 #if !UCONFIG_NO_FILE_IO 122 case 17: name = "TestBug3818"; 123 if(exec) TestBug3818(); break; 124 case 18: name = "TestJapaneseWordBreak"; 125 if(exec) TestJapaneseWordBreak(); break; 126 #else 127 case 17: case 18: name = "skip"; 128 break; 129 #endif 130 131 case 19: name = "TestDebug"; 132 if(exec) TestDebug(); break; 133 case 20: name = "TestTrieDict"; 134 if(exec) TestTrieDict(); break; 135 136 #if !UCONFIG_NO_FILE_IO 137 case 21: name = "TestBug5775"; 138 if (exec) TestBug5775(); break; 139 case 22: name = "TestThaiBreaks"; 140 if (exec) TestThaiBreaks(); break; 141 case 23: name = "TestTailoredBreaks"; 142 if (exec) TestTailoredBreaks(); break; 143 case 24: name = "TestTrieDictWithValue"; 144 if(exec) TestTrieDictWithValue(); break; 145 #else 146 case 21: case 22: case 23: case 24: name = "skip"; 147 break; 148 #endif 149 case 25: name = "TestDictRules"; 150 if (exec) TestDictRules(); break; 151 case 25: name = "TestBug5532"; 152 if (exec) TestBug5532(); break; 153 default: name = ""; break; //needed to end loop 154 } 155 } 156 157 158 //--------------------------------------------------------------------------- 159 // 160 // class BITestData Holds a set of Break iterator test data and results 161 // Includes 
//                         - the string data to be broken
//                         - a vector of the expected break positions.
//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
//                         - The expected break tag values.
//                         - Vectors of actual break positions and tag values.
//                         - Functions for comparing actual with expected and
//                            reporting errors.
//
//----------------------------------------------------------------------------
class BITestData {
public:
    UnicodeString    fDataToBreak;             // The text to be broken.
    UVector          fExpectedBreakPositions;  // Expected break positions.
    UVector          fExpectedTags;            // Expected break tag values.
    UVector          fLineNum;                 // Source line numbers for the data.
    UVector          fActualBreakPositions;    // Test Results.
    UVector          fActualTags;              // Actual break tag values.

    // Constructs an empty data set; 'status' is passed on to each UVector member.
    BITestData(UErrorCode &status);
    // Adds one chunk of test data. Note that 'status' is taken by value here.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    // Compares actual with expected results and reports differences through 'test'.
    void             checkResults(const char *heading, RBBITest *test);
    // Reports a single break-position mismatch through 'test'.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    // Empties the actual break position and actual tag vectors.
    void             clearResults();
};

//
// Constructor.
//
// Every UVector member is constructed with the caller's status; the body is empty.
//
BITestData::BITestData(UErrorCode &status)
: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
  fActualTags(status)
{
}

//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
//                 The macro form collects the line number, which is helpful
//                 when tracking down failures.
//
//                 A null data item is inserted at the start of each test's data
//                 to put the starting zero into the data list.  The position saved for
//                 each non-null item is its ending position.
205 // 206 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 207 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 208 if (U_FAILURE(status)) {return;} 209 if (data != NULL) { 210 fDataToBreak.append(CharsToUnicodeString(data)); 211 } 212 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 213 fExpectedTags.addElement(tag, status); 214 fLineNum.addElement(lineNum, status); 215 } 216 217 218 // 219 // checkResults. Compare the actual and expected break positions, report any differences. 220 // 221 void BITestData::checkResults(const char *heading, RBBITest *test) { 222 int32_t expectedIndex = 0; 223 int32_t actualIndex = 0; 224 225 for (;;) { 226 // If we've run through both the expected and actual results vectors, we're done. 227 // break out of the loop. 228 if (expectedIndex >= fExpectedBreakPositions.size() && 229 actualIndex >= fActualBreakPositions.size()) { 230 break; 231 } 232 233 234 if (expectedIndex >= fExpectedBreakPositions.size()) { 235 err(heading, test, expectedIndex-1, actualIndex); 236 actualIndex++; 237 continue; 238 } 239 240 if (actualIndex >= fActualBreakPositions.size()) { 241 err(heading, test, expectedIndex, actualIndex-1); 242 expectedIndex++; 243 continue; 244 } 245 246 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 247 err(heading, test, expectedIndex, actualIndex); 248 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 249 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 250 actualIndex++; 251 } else { 252 expectedIndex++; 253 } 254 continue; 255 } 256 257 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 258 test->errln("%s, tag mismatch. 
Test Line = %d, expected tag=%d, got %d", 259 heading, fLineNum.elementAt(expectedIndex), 260 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 261 } 262 263 actualIndex++; 264 expectedIndex++; 265 } 266 } 267 268 // 269 // err - An error was found. Report it, along with information about where the 270 // incorrectly broken test data appeared in the source file. 271 // 272 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 273 { 274 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 275 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 276 int32_t o = 0; 277 int32_t line = fLineNum.elementAti(expectedIdx); 278 if (expectedIdx > 0) { 279 // The line numbers are off by one because a premature break occurs somewhere 280 // within the previous item, rather than at the start of the current (expected) item. 281 // We want to report the offset of the unexpected break from the start of 282 // this previous item. 283 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 284 } 285 if (actual < expected) { 286 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 287 } else { 288 test->errln("%s Failed to find break at end of item from line %d. 
actual break: %d expected break: %d", heading, line, actual, expected); 289 } 290 } 291 292 293 void BITestData::clearResults() { 294 fActualBreakPositions.removeAllElements(); 295 fActualTags.removeAllElements(); 296 } 297 298 299 //----------------------------------------------------------------------------------- 300 // 301 // Cannned Test Characters 302 // 303 //----------------------------------------------------------------------------------- 304 305 static const UChar cannedTestArray[] = { 306 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 307 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 308 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 309 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 310 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 311 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 312 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 313 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 314 }; 315 316 static UnicodeString* cannedTestChars = 0; 317 318 #define halfNA "\\u0928\\u094d\\u200d" 319 #define halfSA "\\u0938\\u094d\\u200d" 320 #define halfCHA "\\u091a\\u094d\\u200d" 321 #define halfKA "\\u0915\\u094d\\u200d" 322 #define deadTA "\\u0924\\u094d" 323 324 //-------------------------------------------------------------------------------------- 325 // 326 // RBBITest constructor and destructor 327 // 328 //-------------------------------------------------------------------------------------- 329 330 RBBITest::RBBITest() { 331 UnicodeString 
temp(cannedTestArray); 332 cannedTestChars = new UnicodeString(); 333 *cannedTestChars += (UChar)0x0000; 334 *cannedTestChars += temp; 335 } 336 337 338 RBBITest::~RBBITest() { 339 delete cannedTestChars; 340 } 341 342 343 static const int T_NUMBER = 100; 344 static const int T_LETTER = 200; 345 static const int T_H_OR_K = 300; 346 static const int T_IDEO = 400; 347 348 349 350 351 352 353 //-------------------------------------------------------------------- 354 //Testing the BreakIterator for devanagari script 355 //-------------------------------------------------------------------- 356 357 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 358 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 359 #define deadTTHA "\\u0920\\u094d" 360 #define deadPA "\\u092a\\u094d" 361 #define deadSA "\\u0938\\u094d" 362 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 363 364 365 366 367 368 369 //----------------------------------------------------------------------------------- 370 // 371 // Test for status {tag} return value from break rules. 372 // TODO: a more thorough test. 
373 // 374 //----------------------------------------------------------------------------------- 375 void RBBITest::TestStatusReturn() { 376 UnicodeString rulesString1("$Letters = [:L:];\n" 377 "$Numbers = [:N:];\n" 378 "$Letters+{1};\n" 379 "$Numbers+{2};\n" 380 "Help\\ {4}/me\\!;\n" 381 "[^$Letters $Numbers];\n" 382 "!.*;\n", -1, US_INV); 383 UnicodeString testString1 = "abc123..abc Help me Help me!"; 384 // 01234567890123456789012345678 385 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 386 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 387 388 UErrorCode status=U_ZERO_ERROR; 389 UParseError parseError; 390 391 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 392 if(U_FAILURE(status)) { 393 dataerrln("FAIL : in construction - %s", u_errorName(status)); 394 } else { 395 int32_t pos; 396 int32_t i = 0; 397 bi->setText(testString1); 398 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 399 if (pos != bounds1[i]) { 400 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 401 break; 402 } 403 404 int tag = bi->getRuleStatus(); 405 if (tag != brkStatus[i]) { 406 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 407 break; 408 } 409 i++; 410 } 411 } 412 delete bi; 413 } 414 415 416 static void printStringBreaks(UnicodeString ustr, int expected[], 417 int expectedcount) 418 { 419 UErrorCode status = U_ZERO_ERROR; 420 char name[100]; 421 printf("code alpha extend alphanum type word sent line name\n"); 422 int j; 423 for (j = 0; j < ustr.length(); j ++) { 424 if (expectedcount > 0) { 425 int k; 426 for (k = 0; k < expectedcount; k ++) { 427 if (j == expected[k]) { 428 printf("------------------------------------------------ %d\n", 429 j); 430 } 431 } 432 } 433 UChar32 c = ustr.char32At(j); 434 if (c > 0xffff) { 435 j ++; 436 } 437 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 438 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 439 u_isUAlphabetic(c), 440 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 441 u_isalnum(c), 442 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 443 u_charType(c), 444 U_SHORT_PROPERTY_NAME), 445 u_getPropertyValueName(UCHAR_WORD_BREAK, 446 u_getIntPropertyValue(c, 447 UCHAR_WORD_BREAK), 448 U_SHORT_PROPERTY_NAME), 449 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 450 u_getIntPropertyValue(c, 451 UCHAR_SENTENCE_BREAK), 452 U_SHORT_PROPERTY_NAME), 453 u_getPropertyValueName(UCHAR_LINE_BREAK, 454 u_getIntPropertyValue(c, 455 UCHAR_LINE_BREAK), 456 U_SHORT_PROPERTY_NAME), 457 name); 458 } 459 } 460 461 void RBBITest::TestThaiLineBreak() { 462 UErrorCode status = U_ZERO_ERROR; 463 BITestData thaiLineSelection(status); 464 465 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 466 // represents elided letters at the end of a long word. It should be bound to 467 // the end of the word and not treated as an independent punctuation mark. 468 469 470 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 471 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 472 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 473 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 474 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 475 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 476 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 477 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 478 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 479 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 480 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 481 ADD_DATACHUNK(thaiLineSelection, 
"\\u0e40\\u0e23\\u0e48\\u0e07", 0, status); 482 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 483 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 484 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 485 486 // the one time where the paiyannoi occurs somewhere other than at the end 487 // of a word is in the Thai abbrevation for "etc.", which both begins and 488 // ends with a paiyannoi 489 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 490 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 491 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 492 493 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 494 Locale("th"), status); 495 if (U_FAILURE(status)) 496 { 497 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 498 return; 499 } 500 501 generalIteratorTest(*e, thaiLineSelection); 502 delete e; 503 } 504 505 506 507 void RBBITest::TestMixedThaiLineBreak() 508 { 509 UErrorCode status = U_ZERO_ERROR; 510 BITestData thaiLineSelection(status); 511 512 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 513 514 515 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 516 // start 517 518 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 519 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 520 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 521 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 522 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 523 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 524 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 525 
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status); 526 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 527 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 528 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 529 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 530 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 531 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 532 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 533 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 534 535 // @suwit - end of changes 536 537 538 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 539 if (U_FAILURE(status)) 540 { 541 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 542 return; 543 } 544 545 546 generalIteratorTest(*e, thaiLineSelection); 547 delete e; 548 } 549 550 551 void RBBITest::TestMaiyamok() 552 { 553 UErrorCode status = U_ZERO_ERROR; 554 BITestData thaiLineSelection(status); 555 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 556 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 557 // word". 
Instead of appearing as a word unto itself, however, it's kept together 558 // with the word before it 559 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 560 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 561 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 562 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 563 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 564 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 565 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 566 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 567 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 568 569 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 570 Locale("th"), status); 571 572 if (U_FAILURE(status)) 573 { 574 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 575 return; 576 } 577 generalIteratorTest(*e, thaiLineSelection); 578 delete e; 579 } 580 581 582 583 void RBBITest::TestBug3818() { 584 UErrorCode status = U_ZERO_ERROR; 585 586 // Four Thai words... 
587 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 588 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 589 UnicodeString thaiStr(thaiWordData); 590 591 RuleBasedBreakIterator* bi = 592 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 593 if (U_FAILURE(status) || bi == NULL) { 594 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 595 return; 596 } 597 bi->setText(thaiStr); 598 599 int32_t startOfSecondWord = bi->following(1); 600 if (startOfSecondWord != 4) { 601 errln("Fail at file %s, line %d expected start of word at 4, got %d", 602 __FILE__, __LINE__, startOfSecondWord); 603 } 604 startOfSecondWord = bi->following(0); 605 if (startOfSecondWord != 4) { 606 errln("Fail at file %s, line %d expected start of word at 4, got %d", 607 __FILE__, __LINE__, startOfSecondWord); 608 } 609 delete bi; 610 } 611 612 613 void RBBITest::TestJapaneseWordBreak() { 614 // TODO: Rewrite this test for a dictionary-based word breaking. 
615 #if 0 616 UErrorCode status = U_ZERO_ERROR; 617 BITestData japaneseWordSelection(status); 618 619 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 620 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 621 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 622 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 623 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 624 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 625 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 626 627 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 628 Locale("ja"), status); 629 if (U_FAILURE(status)) 630 { 631 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 632 return; 633 } 634 635 generalIteratorTest(*e, japaneseWordSelection); 636 delete e; 637 #endif 638 } 639 640 void RBBITest::TestTrieDict() { 641 UErrorCode status = U_ZERO_ERROR; 642 643 // 644 // Open and read the test data file. 645 // 646 const char *testDataDirectory = IntlTest::getSourceTestData(status); 647 char testFileName[1000]; 648 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 649 errln("Can't open test data. 
Path too long."); 650 return; 651 } 652 strcpy(testFileName, testDataDirectory); 653 strcat(testFileName, "riwords.txt"); 654 655 // Items needing deleting at the end 656 MutableTrieDictionary *mutableDict = NULL; 657 CompactTrieDictionary *compactDict = NULL; 658 UnicodeSet *breaks = NULL; 659 UChar *testFile = NULL; 660 StringEnumeration *enumer1 = NULL; 661 StringEnumeration *enumer2 = NULL; 662 MutableTrieDictionary *mutable2 = NULL; 663 StringEnumeration *cloneEnum = NULL; 664 CompactTrieDictionary *compact2 = NULL; 665 666 667 const UnicodeString *originalWord = NULL; 668 const UnicodeString *cloneWord = NULL; 669 UChar *current; 670 UChar *word; 671 UChar uc; 672 int32_t wordLen; 673 int32_t wordCount; 674 int32_t testCount; 675 676 int len; 677 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 678 if (U_FAILURE(status)) { 679 goto cleanup; /* something went wrong, error already output */ 680 } 681 682 mutableDict = new MutableTrieDictionary(0x0E1C, status); 683 if (U_FAILURE(status)) { 684 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 685 goto cleanup; 686 } 687 688 breaks = new UnicodeSet; 689 breaks->add(0x000A); // Line Feed 690 breaks->add(0x000D); // Carriage Return 691 breaks->add(0x2028); // Line Separator 692 breaks->add(0x2029); // Paragraph Separator 693 694 // Now add each non-comment line of the file as a word. 
695 current = testFile; 696 word = current; 697 uc = *current++; 698 wordLen = 0; 699 wordCount = 0; 700 701 while (uc) { 702 if (uc == 0x0023) { // #comment line, skip 703 while (uc && !breaks->contains(uc)) { 704 uc = *current++; 705 } 706 } 707 else while (uc && !breaks->contains(uc)) { 708 ++wordLen; 709 uc = *current++; 710 } 711 if (wordLen > 0) { 712 mutableDict->addWord(word, wordLen, status); 713 if (U_FAILURE(status)) { 714 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 715 goto cleanup; 716 } 717 wordCount += 1; 718 } 719 720 // Find beginning of next line 721 while (uc && breaks->contains(uc)) { 722 uc = *current++; 723 } 724 word = current-1; 725 wordLen = 0; 726 } 727 728 if (wordCount < 50) { 729 errln("Word count (%d) unreasonably small\n", wordCount); 730 goto cleanup; 731 } 732 733 enumer1 = mutableDict->openWords(status); 734 if (U_FAILURE(status)) { 735 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 736 goto cleanup; 737 } 738 739 testCount = 0; 740 if (wordCount != (testCount = enumer1->count(status))) { 741 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 742 testCount, wordCount, u_errorName(status)); 743 goto cleanup; 744 } 745 746 // Now compact it 747 compactDict = new CompactTrieDictionary(*mutableDict, status); 748 if (U_FAILURE(status)) { 749 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 750 goto cleanup; 751 } 752 753 enumer2 = compactDict->openWords(status); 754 if (U_FAILURE(status)) { 755 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 756 goto cleanup; 757 } 758 759 if (wordCount != (testCount = enumer2->count(status))) { 760 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 761 testCount, wordCount, u_errorName(status)); 762 goto cleanup; 763 } 764 765 if (typeid(*enumer1) == typeid(*enumer2)) 
{ 766 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same"); 767 } 768 delete enumer1; 769 enumer1 = NULL; 770 delete enumer2; 771 enumer2 = NULL; 772 773 // Now un-compact it 774 mutable2 = compactDict->cloneMutable(status); 775 if (U_FAILURE(status)) { 776 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 777 goto cleanup; 778 } 779 780 cloneEnum = mutable2->openWords(status); 781 if (U_FAILURE(status)) { 782 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 783 goto cleanup; 784 } 785 786 if (wordCount != (testCount = cloneEnum->count(status))) { 787 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 788 testCount, wordCount, u_errorName(status)); 789 goto cleanup; 790 } 791 792 // Compact original dictionary to clone. Note that we can only compare the same kind of 793 // dictionary as the order of the enumerators is not guaranteed to be the same between 794 // different kinds 795 enumer1 = mutableDict->openWords(status); 796 if (U_FAILURE(status)) { 797 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 798 goto cleanup; 799 } 800 801 originalWord = enumer1->snext(status); 802 cloneWord = cloneEnum->snext(status); 803 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 804 if (*originalWord != *cloneWord) { 805 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 806 goto cleanup; 807 } 808 originalWord = enumer1->snext(status); 809 cloneWord = cloneEnum->snext(status); 810 } 811 812 if (U_FAILURE(status)) { 813 errln("Enumeration failed: %s\n", u_errorName(status)); 814 goto cleanup; 815 } 816 817 if (originalWord != cloneWord) { 818 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 819 goto cleanup; 820 } 821 822 // Test the data copying constructor for CompactTrieDict, and the data access 
APIs. 823 compact2 = new CompactTrieDictionary(compactDict->data(), status); 824 if (U_FAILURE(status)) { 825 errln("CompactTrieDictionary(const void *,...) failed\n"); 826 goto cleanup; 827 } 828 829 if (compact2->dataSize() == 0) { 830 errln("CompactTrieDictionary->dataSize() == 0\n"); 831 goto cleanup; 832 } 833 834 // Now count the words via the second dictionary 835 delete enumer1; 836 enumer1 = compact2->openWords(status); 837 if (U_FAILURE(status)) { 838 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 839 goto cleanup; 840 } 841 842 if (wordCount != (testCount = enumer1->count(status))) { 843 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 844 testCount, wordCount, u_errorName(status)); 845 goto cleanup; 846 } 847 848 cleanup: 849 delete compactDict; 850 delete mutableDict; 851 delete breaks; 852 delete[] testFile; 853 delete enumer1; 854 delete mutable2; 855 delete cloneEnum; 856 delete compact2; 857 } 858 859 /*TODO: delete later*/ 860 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ 861 UErrorCode status = U_ZERO_ERROR; 862 FILE *outfile = fopen(filename,"w"); 863 UConverter *cvt = ucnv_open("UTF-8", &status); 864 if (U_FAILURE(status)) 865 return; 866 if(outfile != NULL){ 867 status = U_ZERO_ERROR; 868 const UnicodeString *word = enumer->snext(status); 869 while (word != NULL && U_SUCCESS(status)) { 870 char u8word[500]; 871 status = U_ZERO_ERROR; 872 ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(), 873 &status); 874 fprintf(outfile,"%s\n", u8word); 875 status = U_ZERO_ERROR; 876 word = enumer->snext(status); 877 } 878 fclose(outfile); 879 } 880 ucnv_close(cvt); 881 } 882 883 // A very simple helper class to streamline the buffer handling in 884 // TestTrieDictWithValue 885 template<class T, size_t N> 886 class AutoBuffer { 887 public: 888 AutoBuffer(size_t size) : buffer(stackBuffer) { 889 if (size > N) 890 
buffer = new T[size]; 891 } 892 ~AutoBuffer() { 893 if (buffer != stackBuffer) 894 delete [] buffer; 895 } 896 T* elems() { 897 return buffer; 898 } 899 const T& operator[] (size_t i) const { 900 return buffer[i]; 901 } 902 T& operator[] (size_t i) { 903 return buffer[i]; 904 } 905 private: 906 T stackBuffer[N]; 907 T* buffer; 908 AutoBuffer(); 909 }; 910 911 //---------------------------------------------------------------------------- 912 // 913 // TestTrieDictWithValue Test trie dictionaries with logprob values and 914 // more than 2^16 nodes after compaction. 915 // 916 //---------------------------------------------------------------------------- 917 void RBBITest::TestTrieDictWithValue() { 918 UErrorCode status = U_ZERO_ERROR; 919 920 // 921 // Open and read the test data file. 922 // 923 const char *testDataDirectory = IntlTest::getSourceTestData(status); 924 const char *filename = "cjdict-truncated.txt"; 925 char testFileName[1000]; 926 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) { 927 errln("Can't open test data. 
Path too long."); 928 return; 929 } 930 strcpy(testFileName, testDataDirectory); 931 strcat(testFileName, filename); 932 933 // Items needing deleting at the end 934 MutableTrieDictionary *mutableDict = NULL; 935 CompactTrieDictionary *compactDict = NULL; 936 UnicodeSet *breaks = NULL; 937 UChar *testFile = NULL; 938 StringEnumeration *enumer1 = NULL; 939 StringEnumeration *enumer2 = NULL; 940 MutableTrieDictionary *mutable2 = NULL; 941 StringEnumeration *cloneEnum = NULL; 942 CompactTrieDictionary *compact2 = NULL; 943 NumberFormat *nf = NULL; 944 UText *originalText = NULL, *cloneText = NULL; 945 946 const UnicodeString *originalWord = NULL; 947 const UnicodeString *cloneWord = NULL; 948 UChar *current; 949 UChar *word; 950 UChar uc; 951 int32_t wordLen; 952 int32_t wordCount; 953 int32_t testCount; 954 int32_t valueLen; 955 int counter = 0; 956 957 int len; 958 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 959 if (U_FAILURE(status)) { 960 goto cleanup; /* something went wrong, error already output */ 961 } 962 963 mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); 964 if (U_FAILURE(status)) { 965 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 966 goto cleanup; 967 } 968 969 breaks = new UnicodeSet; 970 breaks->add(0x000A); // Line Feed 971 breaks->add(0x000D); // Carriage Return 972 breaks->add(0x2028); // Line Separator 973 breaks->add(0x2029); // Paragraph Separator 974 breaks->add(0x0009); // Tab character 975 976 // Now add each non-comment line of the file as a word. 
977 current = testFile; 978 word = current; 979 uc = *current++; 980 wordLen = 0; 981 wordCount = 0; 982 nf = NumberFormat::createInstance(status); 983 984 while (uc) { 985 UnicodeString ucharValue; 986 valueLen = 0; 987 988 if (uc == 0x0023) { // #comment line, skip 989 while (uc && !breaks->contains(uc)) { 990 uc = *current++; 991 } 992 } 993 else{ 994 while (uc && !breaks->contains(uc)) { 995 ++wordLen; 996 uc = *current++; 997 } 998 if(uc == 0x0009){ //separator is a tab char, read in num after tab 999 uc = *current++; 1000 while (uc && !breaks->contains(uc)) { 1001 ucharValue.append(uc); 1002 uc = *current++; 1003 } 1004 } 1005 } 1006 if (wordLen > 0) { 1007 Formattable value((int32_t)0); 1008 nf->parse(ucharValue.getTerminatedBuffer(), value, status); 1009 1010 if(U_FAILURE(status)){ 1011 errln("parsing of value failed when reading in dictionary\n"); 1012 goto cleanup; 1013 } 1014 mutableDict->addWord(word, wordLen, status, value.getLong()); 1015 if (U_FAILURE(status)) { 1016 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 1017 goto cleanup; 1018 } 1019 wordCount += 1; 1020 } 1021 1022 // Find beginning of next line 1023 while (uc && breaks->contains(uc)) { 1024 uc = *current++; 1025 } 1026 word = current-1; 1027 wordLen = 0; 1028 } 1029 1030 if (wordCount < 50) { 1031 errln("Word count (%d) unreasonably small\n", wordCount); 1032 goto cleanup; 1033 } 1034 1035 enumer1 = mutableDict->openWords(status); 1036 if (U_FAILURE(status)) { 1037 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 1038 goto cleanup; 1039 } 1040 1041 testCount = 0; 1042 if (wordCount != (testCount = enumer1->count(status))) { 1043 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 1044 testCount, wordCount, u_errorName(status)); 1045 goto cleanup; 1046 } 1047 1048 // Now compact it 1049 compactDict = new CompactTrieDictionary(*mutableDict, status); 1050 if 
(U_FAILURE(status)) { 1051 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 1052 goto cleanup; 1053 } 1054 1055 enumer2 = compactDict->openWords(status); 1056 if (U_FAILURE(status)) { 1057 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 1058 goto cleanup; 1059 } 1060 1061 1062 //delete later 1063 // writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); 1064 // writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); 1065 1066 enumer1->reset(status); 1067 enumer2->reset(status); 1068 1069 originalWord = enumer1->snext(status); 1070 cloneWord = enumer2->snext(status); 1071 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 1072 if (*originalWord != *cloneWord) { 1073 errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", 1074 counter, originalWord->length(), cloneWord->length()); 1075 goto cleanup; 1076 } 1077 1078 // check if attached values of the same word in both dictionaries tally 1079 #if 0 1080 int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; 1081 uint16_t values1[originalWord->length()], values2[cloneWord->length()]; 1082 #endif 1083 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 1084 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 1085 AutoBuffer<uint16_t, 20> values1(originalWord->length()); 1086 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 1087 1088 originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 1089 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 1090 1091 int count1, count2; 1092 mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 1093 compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 1094 1095 if(values1[count1-1] != values2[count2-1]){ 1096 errln("Values of 
word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n", 1097 counter, values1[count1-1], values2[count2-1]); 1098 goto cleanup; 1099 } 1100 1101 counter++; 1102 originalWord = enumer1->snext(status); 1103 cloneWord = enumer2->snext(status); 1104 } 1105 if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { 1106 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 1107 } 1108 1109 delete enumer1; 1110 enumer1 = NULL; 1111 delete enumer2; 1112 enumer2 = NULL; 1113 1114 // Now un-compact it 1115 mutable2 = compactDict->cloneMutable(status); 1116 if (U_FAILURE(status)) { 1117 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 1118 goto cleanup; 1119 } 1120 1121 cloneEnum = mutable2->openWords(status); 1122 if (U_FAILURE(status)) { 1123 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 1124 goto cleanup; 1125 } 1126 1127 if (wordCount != (testCount = cloneEnum->count(status))) { 1128 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 1129 testCount, wordCount, u_errorName(status)); 1130 goto cleanup; 1131 } 1132 1133 // Compact original dictionary to clone. 
Note that we can only compare the same kind of 1134 // dictionary as the order of the enumerators is not guaranteed to be the same between 1135 // different kinds 1136 enumer1 = mutableDict->openWords(status); 1137 if (U_FAILURE(status)) { 1138 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 1139 goto cleanup; 1140 } 1141 1142 counter = 0; 1143 originalWord = enumer1->snext(status); 1144 cloneWord = cloneEnum->snext(status); 1145 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 1146 if (*originalWord != *cloneWord) { 1147 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 1148 goto cleanup; 1149 } 1150 1151 // check if attached values of the same word in both dictionaries tally 1152 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 1153 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 1154 AutoBuffer<uint16_t, 20> values1(originalWord->length()); 1155 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 1156 originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 1157 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 1158 1159 int count1, count2; 1160 mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 1161 mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 1162 1163 if(values1[count1-1] != values2[count2-1]){ 1164 errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", 1165 counter, values1[count1-1], values2[count2-1]); 1166 goto cleanup; 1167 } 1168 1169 counter++; 1170 1171 originalWord = enumer1->snext(status); 1172 cloneWord = cloneEnum->snext(status); 1173 } 1174 1175 if (U_FAILURE(status)) { 1176 errln("Enumeration failed: %s\n", u_errorName(status)); 1177 goto cleanup; 1178 } 1179 1180 if (originalWord != cloneWord) { 1181 
errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 1182 goto cleanup; 1183 } 1184 1185 // Test the data copying constructor for CompactTrieDict, and the data access APIs. 1186 compact2 = new CompactTrieDictionary(compactDict->data(), status); 1187 if (U_FAILURE(status)) { 1188 errln("CompactTrieDictionary(const void *,...) failed\n"); 1189 goto cleanup; 1190 } 1191 1192 if (compact2->dataSize() == 0) { 1193 errln("CompactTrieDictionary->dataSize() == 0\n"); 1194 goto cleanup; 1195 } 1196 1197 // Now count the words via the second dictionary 1198 delete enumer1; 1199 enumer1 = compact2->openWords(status); 1200 if (U_FAILURE(status)) { 1201 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 1202 goto cleanup; 1203 } 1204 1205 if (wordCount != (testCount = enumer1->count(status))) { 1206 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 1207 testCount, wordCount, u_errorName(status)); 1208 goto cleanup; 1209 } 1210 1211 cleanup: 1212 delete compactDict; 1213 delete mutableDict; 1214 delete breaks; 1215 delete[] testFile; 1216 delete enumer1; 1217 delete mutable2; 1218 delete cloneEnum; 1219 delete compact2; 1220 utext_close(originalText); 1221 utext_close(cloneText); 1222 1223 1224 } 1225 1226 //---------------------------------------------------------------------------- 1227 // 1228 // generalIteratorTest Given a break iterator and a set of test data, 1229 // Run the tests and report the results. 
1230 // 1231 //---------------------------------------------------------------------------- 1232 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 1233 { 1234 1235 bi.setText(td.fDataToBreak); 1236 1237 testFirstAndNext(bi, td); 1238 1239 testLastAndPrevious(bi, td); 1240 1241 testFollowing(bi, td); 1242 testPreceding(bi, td); 1243 testIsBoundary(bi, td); 1244 doMultipleSelectionTest(bi, td); 1245 } 1246 1247 1248 // 1249 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 1250 // kind of loop. 1251 // 1252 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 1253 { 1254 UErrorCode status = U_ZERO_ERROR; 1255 int32_t p; 1256 int32_t lastP = -1; 1257 int32_t tag; 1258 1259 logln("Test first and next"); 1260 bi.setText(td.fDataToBreak); 1261 td.clearResults(); 1262 1263 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 1264 td.fActualBreakPositions.addElement(p, status); // Save result. 1265 tag = bi.getRuleStatus(); 1266 td.fActualTags.addElement(tag, status); 1267 if (p <= lastP) { 1268 // If the iterator is not making forward progress, stop. 1269 // No need to raise an error here, it'll be detected in the normal check of results. 1270 break; 1271 } 1272 lastP = p; 1273 } 1274 td.checkResults("testFirstAndNext", this); 1275 } 1276 1277 1278 // 1279 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 1280 // 1281 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 1282 { 1283 UErrorCode status = U_ZERO_ERROR; 1284 int32_t p; 1285 int32_t lastP = 0x7ffffffe; 1286 int32_t tag; 1287 1288 logln("Test last and previous"); 1289 bi.setText(td.fDataToBreak); 1290 td.clearResults(); 1291 1292 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 1293 // Save break position. Insert it at start of vector of results, shoving 1294 // already-saved results further towards the end. 
1295 td.fActualBreakPositions.insertElementAt(p, 0, status); 1296 // bi.previous(); // TODO: Why does this fix things up???? 1297 // bi.next(); 1298 tag = bi.getRuleStatus(); 1299 td.fActualTags.insertElementAt(tag, 0, status); 1300 if (p >= lastP) { 1301 // If the iterator is not making progress, stop. 1302 // No need to raise an error here, it'll be detected in the normal check of results. 1303 break; 1304 } 1305 lastP = p; 1306 } 1307 td.checkResults("testLastAndPrevious", this); 1308 } 1309 1310 1311 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 1312 { 1313 UErrorCode status = U_ZERO_ERROR; 1314 int32_t p; 1315 int32_t tag; 1316 int32_t lastP = -2; // A value that will never be returned as a break position. 1317 // cannot be -1; that is returned for DONE. 1318 int i; 1319 1320 logln("testFollowing():"); 1321 bi.setText(td.fDataToBreak); 1322 td.clearResults(); 1323 1324 // Save the starting point, since we won't get that out of following. 1325 p = bi.first(); 1326 td.fActualBreakPositions.addElement(p, status); // Save result. 1327 tag = bi.getRuleStatus(); 1328 td.fActualTags.addElement(tag, status); 1329 1330 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 1331 p = bi.following(i); 1332 if (p != lastP) { 1333 if (p == RuleBasedBreakIterator::DONE) { 1334 break; 1335 } 1336 // We've reached a new break position. Save it. 1337 td.fActualBreakPositions.addElement(p, status); // Save result. 1338 tag = bi.getRuleStatus(); 1339 td.fActualTags.addElement(tag, status); 1340 lastP = p; 1341 } 1342 } 1343 // The loop normally exits by means of the break in the middle. 1344 // Make sure that the index was at the correct position for the break iterator to have 1345 // returned DONE. 1346 if (i != td.fDataToBreak.length()) { 1347 errln("testFollowing(): iterator returned DONE prematurely."); 1348 } 1349 1350 // Full check of all results. 
1351 td.checkResults("testFollowing", this); 1352 } 1353 1354 1355 1356 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 1357 UErrorCode status = U_ZERO_ERROR; 1358 int32_t p; 1359 int32_t tag; 1360 int32_t lastP = 0x7ffffffe; 1361 int i; 1362 1363 logln("testPreceding():"); 1364 bi.setText(td.fDataToBreak); 1365 td.clearResults(); 1366 1367 p = bi.last(); 1368 td.fActualBreakPositions.addElement(p, status); 1369 tag = bi.getRuleStatus(); 1370 td.fActualTags.addElement(tag, status); 1371 1372 for (i = td.fDataToBreak.length(); i>=-1; i--) { 1373 p = bi.preceding(i); 1374 if (p != lastP) { 1375 if (p == RuleBasedBreakIterator::DONE) { 1376 break; 1377 } 1378 // We've reached a new break position. Save it. 1379 td.fActualBreakPositions.insertElementAt(p, 0, status); 1380 lastP = p; 1381 tag = bi.getRuleStatus(); 1382 td.fActualTags.insertElementAt(tag, 0, status); 1383 } 1384 } 1385 // The loop normally exits by means of the break in the middle. 1386 // Make sure that the index was at the correct position for the break iterator to have 1387 // returned DONE. 1388 if (i != 0) { 1389 errln("testPreceding(): iterator returned DONE prematurely."); 1390 } 1391 1392 // Full check of all results. 1393 td.checkResults("testPreceding", this); 1394 } 1395 1396 1397 1398 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 1399 UErrorCode status = U_ZERO_ERROR; 1400 int i; 1401 int32_t tag; 1402 1403 logln("testIsBoundary():"); 1404 bi.setText(td.fDataToBreak); 1405 td.clearResults(); 1406 1407 for (i = 0; i <= td.fDataToBreak.length(); i++) { 1408 if (bi.isBoundary(i)) { 1409 td.fActualBreakPositions.addElement(i, status); // Save result. 
1410 tag = bi.getRuleStatus(); 1411 td.fActualTags.addElement(tag, status); 1412 } 1413 } 1414 td.checkResults("testIsBoundary: ", this); 1415 } 1416 1417 1418 1419 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1420 { 1421 iterator.setText(td.fDataToBreak); 1422 1423 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1424 int32_t offset = iterator.first(); 1425 int32_t testOffset; 1426 int32_t count = 0; 1427 1428 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1429 1430 if (*testIterator != iterator) 1431 errln("clone() or operator!= failed: two clones compared unequal"); 1432 1433 do { 1434 testOffset = testIterator->first(); 1435 testOffset = testIterator->next(count); 1436 if (offset != testOffset) 1437 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1438 1439 if (offset != RuleBasedBreakIterator::DONE) { 1440 count++; 1441 offset = iterator.next(); 1442 1443 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1444 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1445 if (count > 10000 || offset == -1) { 1446 errln("operator== failed too many times. Stopping test."); 1447 if (offset == -1) { 1448 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1449 } 1450 return; 1451 } 1452 } 1453 } 1454 } while (offset != RuleBasedBreakIterator::DONE); 1455 1456 // now do it backwards... 
1457 offset = iterator.last(); 1458 count = 0; 1459 1460 do { 1461 testOffset = testIterator->last(); 1462 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1463 if (offset != testOffset) 1464 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1465 1466 if (offset != RuleBasedBreakIterator::DONE) { 1467 count--; 1468 offset = iterator.previous(); 1469 } 1470 } while (offset != RuleBasedBreakIterator::DONE); 1471 1472 delete testIterator; 1473 } 1474 1475 1476 //--------------------------------------------- 1477 // 1478 // other tests 1479 // 1480 //--------------------------------------------- 1481 void RBBITest::TestEmptyString() 1482 { 1483 UnicodeString text = ""; 1484 UErrorCode status = U_ZERO_ERROR; 1485 1486 BITestData x(status); 1487 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1488 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1489 if (U_FAILURE(status)) 1490 { 1491 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1492 return; 1493 } 1494 generalIteratorTest(*bi, x); 1495 delete bi; 1496 } 1497 1498 void RBBITest::TestGetAvailableLocales() 1499 { 1500 int32_t locCount = 0; 1501 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1502 1503 if (locCount == 0) 1504 dataerrln("getAvailableLocales() returned an empty list!"); 1505 // Just make sure that it's returning good memory. 
1506 int32_t i; 1507 for (i = 0; i < locCount; ++i) { 1508 logln(locList[i].getName()); 1509 } 1510 } 1511 1512 //Testing the BreakIterator::getDisplayName() function 1513 void RBBITest::TestGetDisplayName() 1514 { 1515 UnicodeString result; 1516 1517 BreakIterator::getDisplayName(Locale::getUS(), result); 1518 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1519 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1520 + result); 1521 1522 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1523 if (result != "French (France)") 1524 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1525 + result); 1526 } 1527 /** 1528 * Test End Behaviour 1529 * @bug 4068137 1530 */ 1531 void RBBITest::TestEndBehaviour() 1532 { 1533 UErrorCode status = U_ZERO_ERROR; 1534 UnicodeString testString("boo."); 1535 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1536 if (U_FAILURE(status)) 1537 { 1538 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1539 return; 1540 } 1541 wb->setText(testString); 1542 1543 if (wb->first() != 0) 1544 errln("Didn't get break at beginning of string."); 1545 if (wb->next() != 3) 1546 errln("Didn't get break before period in \"boo.\""); 1547 if (wb->current() != 4 && wb->next() != 4) 1548 errln("Didn't get break at end of string."); 1549 delete wb; 1550 } 1551 /* 1552 * @bug 4153072 1553 */ 1554 void RBBITest::TestBug4153072() { 1555 UErrorCode status = U_ZERO_ERROR; 1556 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1557 if (U_FAILURE(status)) 1558 { 1559 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1560 return; 1561 } 1562 UnicodeString str("...Hello, World!..."); 1563 int32_t begin = 3; 1564 int32_t end = str.length() - 3; 1565 UBool onBoundary; 1566 1567 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1568 iter->adoptText(textIterator); 1569 int index; 1570 // Note: with the switch to UText, there is no way to restrict the 1571 // iteration range to begin at an index other than zero. 1572 // String character iterators created with a non-zero bound are 1573 // treated by RBBI as being empty. 1574 for (index = -1; index < begin + 1; ++index) { 1575 onBoundary = iter->isBoundary(index); 1576 if (index == 0? !onBoundary : onBoundary) { 1577 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1578 " and begin index = " + begin); 1579 } 1580 } 1581 delete iter; 1582 } 1583 1584 1585 // 1586 // Test for problem reported by Ashok Matoria on 9 July 2007 1587 // One.<kSoftHyphen><kSpace>Two. 1588 // 1589 // Sentence break at start (0) and then on calling next() it breaks at 1590 // 'T' of "Two". Now, at this point if I do next() and 1591 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1592 // 1593 void RBBITest::TestBug5775() { 1594 UErrorCode status = U_ZERO_ERROR; 1595 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1596 TEST_ASSERT_SUCCESS(status); 1597 if (U_FAILURE(status)) { 1598 return; 1599 } 1600 // Check for status first for better handling of no data errors. 1601 TEST_ASSERT(bi != NULL); 1602 if (bi == NULL) { 1603 return; 1604 } 1605 1606 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1607 // 01234 56789 1608 s = s.unescape(); 1609 bi->setText(s); 1610 int pos = bi->next(); 1611 TEST_ASSERT(pos == 6); 1612 pos = bi->next(); 1613 TEST_ASSERT(pos == 10); 1614 pos = bi->previous(); 1615 TEST_ASSERT(pos == 6); 1616 delete bi; 1617 } 1618 1619 1620 1621 /** 1622 * Test Japanese Line Break 1623 * @bug 4095322 1624 */ 1625 void RBBITest::TestJapaneseLineBreak() 1626 { 1627 #if 0 1628 // Test needs updating some more... Dump it for now. 1629 1630 1631 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1632 // as opening and closing punctuation for line breaking. 1633 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1634 // from these tests. 
6-13-2002 1635 // 1636 UErrorCode status = U_ZERO_ERROR; 1637 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1638 UnicodeString precedingChars = CharsToUnicodeString( 1639 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1640 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1641 UnicodeString followingChars = CharsToUnicodeString( 1642 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1643 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1644 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1645 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1646 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1647 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1648 1649 int32_t i; 1650 if (U_FAILURE(status)) 1651 { 1652 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1653 return; 1654 } 1655 1656 for (i = 0; i < precedingChars.length(); i++) { 1657 testString.setCharAt(1, precedingChars[i]); 1658 iter->setText(testString); 1659 int32_t j = iter->first(); 1660 if (j != 0) 1661 errln("ja line break failure: failed to start at 0"); 1662 j = iter->next(); 1663 if (j != 1) 1664 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1665 + "' (" + ((int)(precedingChars[i])) + ")"); 1666 j = iter->next(); 1667 if (j != 3) 1668 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1669 + "' (" + ((int)(precedingChars[i])) + ")"); 1670 } 1671 1672 for (i = 0; i < followingChars.length(); i++) { 1673 testString.setCharAt(1, followingChars[i]); 1674 iter->setText(testString); 1675 int j = iter->first(); 1676 if (j != 0) 1677 errln("ja line break failure: failed to start at 0"); 1678 j = iter->next(); 1679 
if (j != 2) 1680 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1681 + "' (" + ((int)(followingChars[i])) + ")"); 1682 j = iter->next(); 1683 if (j != 3) 1684 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1685 + "' (" + ((int)(followingChars[i])) + ")"); 1686 } 1687 delete iter; 1688 #endif 1689 } 1690 1691 1692 //------------------------------------------------------------------------------ 1693 // 1694 // RBBITest::Extended Run RBBI Tests from an external test data file 1695 // 1696 //------------------------------------------------------------------------------ 1697 1698 struct TestParams { 1699 BreakIterator *bi; 1700 UnicodeString dataToBreak; 1701 UVector32 *expectedBreaks; 1702 UVector32 *srcLine; 1703 UVector32 *srcCol; 1704 }; 1705 1706 void RBBITest::executeTest(TestParams *t) { 1707 int32_t bp; 1708 int32_t prevBP; 1709 int32_t i; 1710 1711 if (t->bi == NULL) { 1712 return; 1713 } 1714 1715 t->bi->setText(t->dataToBreak); 1716 // 1717 // Run the iterator forward 1718 // 1719 prevBP = -1; 1720 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1721 if (prevBP == bp) { 1722 // Fail for lack of forward progress. 1723 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1724 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1725 break; 1726 } 1727 1728 // Check that there were we didn't miss an expected break between the last one 1729 // and this one. 1730 for (i=prevBP+1; i<bp; i++) { 1731 if (t->expectedBreaks->elementAti(i) != 0) { 1732 int expected[] = {0, i}; 1733 printStringBreaks(t->dataToBreak, expected, 2); 1734 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1735 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1736 } 1737 } 1738 1739 // Check that the break we did find was expected 1740 if (t->expectedBreaks->elementAti(bp) == 0) { 1741 int expected[] = {0, bp}; 1742 printStringBreaks(t->dataToBreak, expected, 2); 1743 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1744 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1745 } else { 1746 // The break was expected. 1747 // Check that the {nnn} tag value is correct. 1748 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1749 if (expectedTagVal == -1) { 1750 expectedTagVal = 0; 1751 } 1752 int32_t line = t->srcLine->elementAti(bp); 1753 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1754 if (rs != expectedTagVal) { 1755 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1756 " Actual, Expected status = %4d, %4d", 1757 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1758 } 1759 } 1760 1761 1762 prevBP = bp; 1763 } 1764 1765 // Verify that there were no missed expected breaks after the last one found 1766 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1767 if (t->expectedBreaks->elementAti(i) != 0) { 1768 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1769 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1770 } 1771 } 1772 1773 // 1774 // Run the iterator backwards, verify that the same breaks are found. 1775 // 1776 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1777 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1778 if (prevBP == bp) { 1779 // Fail for lack of progress. 1780 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1781 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1782 break; 1783 } 1784 1785 // Check that there were we didn't miss an expected break between the last one 1786 // and this one. (UVector returns zeros for index out of bounds.) 1787 for (i=prevBP-1; i>bp; i--) { 1788 if (t->expectedBreaks->elementAti(i) != 0) { 1789 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1790 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1791 } 1792 } 1793 1794 // Check that the break we did find was expected 1795 if (t->expectedBreaks->elementAti(bp) == 0) { 1796 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1797 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1798 } else { 1799 // The break was expected. 1800 // Check that the {nnn} tag value is correct. 1801 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1802 if (expectedTagVal == -1) { 1803 expectedTagVal = 0; 1804 } 1805 int line = t->srcLine->elementAti(bp); 1806 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1807 if (rs != expectedTagVal) { 1808 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1809 " Actual, Expected status = %4d, %4d", 1810 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1811 } 1812 } 1813 1814 prevBP = bp; 1815 } 1816 1817 // Verify that there were no missed breaks prior to the last one found 1818 for (i=prevBP-1; i>=0; i--) { 1819 if (t->expectedBreaks->elementAti(i) != 0) { 1820 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1821 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1822 } 1823 } 1824 } 1825 1826 1827 void RBBITest::TestExtended() { 1828 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1829 UErrorCode status = U_ZERO_ERROR; 1830 Locale locale(""); 1831 1832 UnicodeString rules; 1833 TestParams tp; 1834 tp.bi = NULL; 1835 tp.expectedBreaks = new UVector32(status); 1836 tp.srcLine = new UVector32(status); 1837 tp.srcCol = new UVector32(status); 1838 1839 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1840 if (U_FAILURE(status)) { 1841 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1842 } 1843 1844 1845 // 1846 // Open and read the test data file. 1847 // 1848 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1849 char testFileName[1000]; 1850 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1851 errln("Can't open test data. 
Path too long."); 1852 return; 1853 } 1854 strcpy(testFileName, testDataDirectory); 1855 strcat(testFileName, "rbbitst.txt"); 1856 1857 int len; 1858 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1859 if (U_FAILURE(status)) { 1860 return; /* something went wrong, error already output */ 1861 } 1862 1863 1864 1865 1866 // 1867 // Put the test data into a UnicodeString 1868 // 1869 UnicodeString testString(FALSE, testFile, len); 1870 1871 enum EParseState{ 1872 PARSE_COMMENT, 1873 PARSE_TAG, 1874 PARSE_DATA, 1875 PARSE_NUM 1876 } 1877 parseState = PARSE_TAG; 1878 1879 EParseState savedState = PARSE_TAG; 1880 1881 static const UChar CH_LF = 0x0a; 1882 static const UChar CH_CR = 0x0d; 1883 static const UChar CH_HASH = 0x23; 1884 /*static const UChar CH_PERIOD = 0x2e;*/ 1885 static const UChar CH_LT = 0x3c; 1886 static const UChar CH_GT = 0x3e; 1887 static const UChar CH_BACKSLASH = 0x5c; 1888 static const UChar CH_BULLET = 0x2022; 1889 1890 int32_t lineNum = 1; 1891 int32_t colStart = 0; 1892 int32_t column = 0; 1893 int32_t charIdx = 0; 1894 1895 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1896 1897 for (charIdx = 0; charIdx < len; ) { 1898 status = U_ZERO_ERROR; 1899 UChar c = testString.charAt(charIdx); 1900 charIdx++; 1901 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1902 // treat CRLF as a unit 1903 c = CH_LF; 1904 charIdx++; 1905 } 1906 if (c == CH_LF || c == CH_CR) { 1907 lineNum++; 1908 colStart = charIdx; 1909 } 1910 column = charIdx - colStart + 1; 1911 1912 switch (parseState) { 1913 case PARSE_COMMENT: 1914 if (c == 0x0a || c == 0x0d) { 1915 parseState = savedState; 1916 } 1917 break; 1918 1919 case PARSE_TAG: 1920 { 1921 if (c == CH_HASH) { 1922 parseState = PARSE_COMMENT; 1923 savedState = PARSE_TAG; 1924 break; 1925 } 1926 if (u_isUWhiteSpace(c)) { 1927 break; 1928 } 1929 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1930 delete tp.bi; 1931 tp.bi = BreakIterator::createWordInstance(locale, status); 1932 charIdx += 5; 1933 break; 1934 } 1935 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1936 delete tp.bi; 1937 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1938 charIdx += 5; 1939 break; 1940 } 1941 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1942 delete tp.bi; 1943 tp.bi = BreakIterator::createLineInstance(locale, status); 1944 charIdx += 5; 1945 break; 1946 } 1947 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1948 delete tp.bi; 1949 tp.bi = NULL; 1950 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1951 charIdx += 5; 1952 break; 1953 } 1954 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1955 delete tp.bi; 1956 tp.bi = BreakIterator::createTitleInstance(locale, status); 1957 charIdx += 6; 1958 break; 1959 } 1960 1961 // <locale loc_name> 1962 localeMatcher.reset(testString); 1963 if (localeMatcher.lookingAt(charIdx-1, status)) { 1964 UnicodeString localeName = localeMatcher.group(1, status); 1965 char localeName8[100]; 1966 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1967 locale = 
Locale::createFromName(localeName8); 1968 charIdx += localeMatcher.group(0, status).length(); 1969 TEST_ASSERT_SUCCESS(status); 1970 break; 1971 } 1972 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1973 parseState = PARSE_DATA; 1974 charIdx += 5; 1975 tp.dataToBreak = ""; 1976 tp.expectedBreaks->removeAllElements(); 1977 tp.srcCol ->removeAllElements(); 1978 tp.srcLine->removeAllElements(); 1979 break; 1980 } 1981 1982 errln("line %d: Tag expected in test file.", lineNum); 1983 parseState = PARSE_COMMENT; 1984 savedState = PARSE_DATA; 1985 goto end_test; // Stop the test. 1986 } 1987 break; 1988 1989 case PARSE_DATA: 1990 if (c == CH_BULLET) { 1991 int32_t breakIdx = tp.dataToBreak.length(); 1992 tp.expectedBreaks->setSize(breakIdx+1); 1993 tp.expectedBreaks->setElementAt(-1, breakIdx); 1994 tp.srcLine->setSize(breakIdx+1); 1995 tp.srcLine->setElementAt(lineNum, breakIdx); 1996 tp.srcCol ->setSize(breakIdx+1); 1997 tp.srcCol ->setElementAt(column, breakIdx); 1998 break; 1999 } 2000 2001 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 2002 // Add final entry to mappings from break location to source file position. 2003 // Need one extra because last break position returned is after the 2004 // last char in the data, not at the last char. 2005 tp.srcLine->addElement(lineNum, status); 2006 tp.srcCol ->addElement(column, status); 2007 2008 parseState = PARSE_TAG; 2009 charIdx += 6; 2010 2011 // RUN THE TEST! 2012 executeTest(&tp); 2013 break; 2014 } 2015 2016 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 2017 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 2018 // Get the code point from the name and insert it into the test data. 2019 // (Damn, no API takes names in Unicode !!! 
2020 // we've got to take it back to char *) 2021 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 2022 int32_t nameLength = nameEndIdx - (charIdx+2); 2023 char charNameBuf[200]; 2024 UChar32 theChar = -1; 2025 if (nameEndIdx != -1) { 2026 UErrorCode status = U_ZERO_ERROR; 2027 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 2028 charNameBuf[sizeof(charNameBuf)-1] = 0; 2029 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 2030 if (U_FAILURE(status)) { 2031 theChar = -1; 2032 } 2033 } 2034 if (theChar == -1) { 2035 errln("Error in named character in test file at line %d, col %d", 2036 lineNum, column); 2037 } else { 2038 // Named code point was recognized. Insert it 2039 // into the test data. 2040 tp.dataToBreak.append(theChar); 2041 while (tp.dataToBreak.length() > tp.srcLine->size()) { 2042 tp.srcLine->addElement(lineNum, status); 2043 tp.srcCol ->addElement(column, status); 2044 } 2045 } 2046 if (nameEndIdx > charIdx) { 2047 charIdx = nameEndIdx+1; 2048 2049 } 2050 break; 2051 } 2052 2053 2054 2055 2056 if (testString.compare(charIdx-1, 2, "<>") == 0) { 2057 charIdx++; 2058 int32_t breakIdx = tp.dataToBreak.length(); 2059 tp.expectedBreaks->setSize(breakIdx+1); 2060 tp.expectedBreaks->setElementAt(-1, breakIdx); 2061 tp.srcLine->setSize(breakIdx+1); 2062 tp.srcLine->setElementAt(lineNum, breakIdx); 2063 tp.srcCol ->setSize(breakIdx+1); 2064 tp.srcCol ->setElementAt(column, breakIdx); 2065 break; 2066 } 2067 2068 if (c == CH_LT) { 2069 tagValue = 0; 2070 parseState = PARSE_NUM; 2071 break; 2072 } 2073 2074 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 2075 parseState = PARSE_COMMENT; 2076 savedState = PARSE_DATA; 2077 break; 2078 } 2079 2080 if (c == CH_BACKSLASH) { 2081 // Check for \ at end of line, a line continuation. 
2082 // Advance over (discard) the newline 2083 UChar32 cp = testString.char32At(charIdx); 2084 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 2085 // We have a CR LF 2086 // Need an extra increment of the input ptr to move over both of them 2087 charIdx++; 2088 } 2089 if (cp == CH_LF || cp == CH_CR) { 2090 lineNum++; 2091 colStart = charIdx; 2092 charIdx++; 2093 break; 2094 } 2095 2096 // Let unescape handle the back slash. 2097 cp = testString.unescapeAt(charIdx); 2098 if (cp != -1) { 2099 // Escape sequence was recognized. Insert the char 2100 // into the test data. 2101 tp.dataToBreak.append(cp); 2102 while (tp.dataToBreak.length() > tp.srcLine->size()) { 2103 tp.srcLine->addElement(lineNum, status); 2104 tp.srcCol ->addElement(column, status); 2105 } 2106 break; 2107 } 2108 2109 2110 // Not a recognized backslash escape sequence. 2111 // Take the next char as a literal. 2112 // TODO: Should this be an error? 2113 c = testString.charAt(charIdx); 2114 charIdx = testString.moveIndex32(charIdx, 1); 2115 } 2116 2117 // Normal, non-escaped data char. 2118 tp.dataToBreak.append(c); 2119 2120 // Save the mapping from offset in the data to line/column numbers in 2121 // the original input file. Will be used for better error messages only. 2122 // If there's an expected break before this char, the slot in the mapping 2123 // vector will already be set for this char; don't overwrite it. 2124 if (tp.dataToBreak.length() > tp.srcLine->size()) { 2125 tp.srcLine->addElement(lineNum, status); 2126 tp.srcCol ->addElement(column, status); 2127 } 2128 break; 2129 2130 2131 case PARSE_NUM: 2132 // We are parsing an expected numeric tag value, like <1234>, 2133 // within a chunk of data. 2134 if (u_isUWhiteSpace(c)) { 2135 break; 2136 } 2137 2138 if (c == CH_GT) { 2139 // Finished the number. Add the info to the expected break data, 2140 // and switch parse state back to doing plain data. 
2141 parseState = PARSE_DATA; 2142 if (tagValue == 0) { 2143 tagValue = -1; 2144 } 2145 int32_t breakIdx = tp.dataToBreak.length(); 2146 tp.expectedBreaks->setSize(breakIdx+1); 2147 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 2148 tp.srcLine->setSize(breakIdx+1); 2149 tp.srcLine->setElementAt(lineNum, breakIdx); 2150 tp.srcCol ->setSize(breakIdx+1); 2151 tp.srcCol ->setElementAt(column, breakIdx); 2152 break; 2153 } 2154 2155 if (u_isdigit(c)) { 2156 tagValue = tagValue*10 + u_charDigitValue(c); 2157 break; 2158 } 2159 2160 errln("Syntax Error in test file at line %d, col %d", 2161 lineNum, column); 2162 parseState = PARSE_COMMENT; 2163 goto end_test; // Stop the test 2164 break; 2165 } 2166 2167 2168 if (U_FAILURE(status)) { 2169 errln("ICU Error %s while parsing test file at line %d.", 2170 u_errorName(status), lineNum); 2171 status = U_ZERO_ERROR; 2172 goto end_test; // Stop the test 2173 } 2174 2175 } 2176 2177 end_test: 2178 delete tp.bi; 2179 delete tp.expectedBreaks; 2180 delete tp.srcLine; 2181 delete tp.srcCol; 2182 delete [] testFile; 2183 #endif 2184 } 2185 2186 void RBBITest::TestThaiBreaks() { 2187 UErrorCode status=U_ZERO_ERROR; 2188 BreakIterator* b; 2189 Locale locale = Locale("th"); 2190 int32_t p, index; 2191 UChar c[]= { 2192 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 2193 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 2194 0x0E16, 0x0E49, 0x0E33, 0x0000 2195 }; 2196 int32_t expectedWordResult[] = { 2197 2, 3, 6, 10, 11, 15, 17, 20, 22 2198 }; 2199 int32_t expectedLineResult[] = { 2200 3, 6, 11, 15, 17, 20, 22 2201 }; 2202 2203 int32_t size = u_strlen(c); 2204 UnicodeString text=UnicodeString(c); 2205 2206 b = BreakIterator::createWordInstance(locale, status); 2207 if (U_FAILURE(status)) { 2208 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 2209 return; 2210 } 2211 b->setText(text); 2212 p = index = 0; 2213 while ((p=b->next())!=BreakIterator::DONE && p < size) { 2214 if (p != expectedWordResult[index++]) { 2215 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 2216 } 2217 } 2218 delete b; 2219 2220 b = BreakIterator::createLineInstance(locale, status); 2221 if (U_FAILURE(status)) { 2222 printf("Unable to create thai line break iterator.\n"); 2223 return; 2224 } 2225 b->setText(text); 2226 p = index = 0; 2227 while ((p=b->next())!=BreakIterator::DONE && p < size) { 2228 if (p != expectedLineResult[index++]) { 2229 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 2230 } 2231 } 2232 2233 delete b; 2234 } 2235 2236 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 2237 // Words don't include colon or period (cldrbug #1969). 2238 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 2239 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 2240 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 2241 2242 // UBreakIteratorType UBRK_WORD, Locale "ja" 2243 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
// Escaped test text for the "ja" word break entry of tbItems.
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
// Disabled expectations; the active tables follow the #endif.
#if 0
static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
#endif
// There's no separate Japanese word break iterator. Root is the same as Japanese.
// Our dictionary-based iterator has to be tweaked to better handle U+3005,
// U+3007, U+300B and some other cases.
// NOTE(review): the comment introducing this entry lists \u303B, not U+300B - confirm which is meant.
// In each pair, the "T" table is used as the tailored-locale offsets and the "R" table
// as the root-locale offsets by tbItems; here the two tables are identical.
static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
// Tailored ("el") expectations contain two offsets (8, 14) that the root expectations do not.
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
2265 static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 2266 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 2267 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 2268 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 2269 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 2270 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 2271 static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 2272 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 2273 29, 32, 33, 35, 37, 38, 40, 41 }; 2274 2275 typedef struct { 2276 UBreakIteratorType type; 2277 const char * locale; 2278 const char * escapedText; 2279 const int32_t * tailoredOffsets; 2280 int32_t tailoredOffsetsCount; 2281 const int32_t * rootOffsets; 2282 int32_t rootOffsetsCount; 2283 } TailoredBreakItem; 2284 2285 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 2286 2287 static const TailoredBreakItem tbItems[] = { 2288 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 2289 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 2290 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 2291 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 2292 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 2293 }; 2294 2295 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 2296 while (count-- > 0) { 2297 int writeCount; 2298 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 2299 buffer += writeCount; 2300 buflen -= writeCount; 2301 } 2302 } 2303 2304 enum { kMaxOffsetCount = 128 }; 2305 2306 void RBBITest::TBTest(BreakIterator* brkitr, int type, const 
char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 2307 brkitr->setText( CharsToUnicodeString(escapedText) ); 2308 int32_t foundOffsets[kMaxOffsetCount]; 2309 int32_t offset, foundOffsetsCount = 0; 2310 // do forwards iteration test 2311 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 2312 foundOffsets[foundOffsetsCount++] = offset; 2313 } 2314 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 2315 // log error for forwards test 2316 char formatExpect[512], formatFound[512]; 2317 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 2318 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 2319 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 2320 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 2321 } else { 2322 // do backwards iteration test 2323 --foundOffsetsCount; // back off one from the end offset 2324 while ( foundOffsetsCount > 0 ) { 2325 offset = brkitr->previous(); 2326 if ( offset != foundOffsets[--foundOffsetsCount] ) { 2327 // log error for backwards test 2328 char formatExpect[512]; 2329 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 2330 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 2331 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 2332 break; 2333 } 2334 } 2335 } 2336 } 2337 2338 void RBBITest::TestTailoredBreaks() { 2339 const TailoredBreakItem * tbItemPtr; 2340 Locale rootLocale = Locale("root"); 2341 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 2342 Locale testLocale = Locale(tbItemPtr->locale); 2343 BreakIterator * tailoredBrkiter 
= NULL; 2344 BreakIterator * rootBrkiter = NULL; 2345 UErrorCode status = U_ZERO_ERROR; 2346 switch (tbItemPtr->type) { 2347 case UBRK_CHARACTER: 2348 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 2349 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 2350 break; 2351 case UBRK_WORD: 2352 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 2353 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 2354 break; 2355 case UBRK_LINE: 2356 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 2357 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 2358 break; 2359 case UBRK_SENTENCE: 2360 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 2361 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 2362 break; 2363 default: 2364 status = U_UNSUPPORTED_ERROR; 2365 break; 2366 } 2367 if (U_FAILURE(status)) { 2368 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 2369 continue; 2370 } 2371 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 2372 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 2373 2374 delete rootBrkiter; 2375 delete tailoredBrkiter; 2376 } 2377 } 2378 2379 2380 //------------------------------------------------------------------------------- 2381 // 2382 // TestDictRules create a break iterator from source rules that includes a 2383 // dictionary range. Regression for bug #7130. Source rules 2384 // do not declare a break iterator type (word, line, sentence, etc. 2385 // but the dictionary code, without a type, would loop. 
2386 // 2387 //------------------------------------------------------------------------------- 2388 void RBBITest::TestDictRules() { 2389 const char *rules = "$dictionary = [a-z]; \n" 2390 "!!forward; \n" 2391 "$dictionary $dictionary; \n" 2392 "!!reverse; \n" 2393 "$dictionary $dictionary; \n"; 2394 const char *text = "aa"; 2395 UErrorCode status = U_ZERO_ERROR; 2396 UParseError parseError; 2397 2398 RuleBasedBreakIterator bi(rules, parseError, status); 2399 if (U_SUCCESS(status)) { 2400 UnicodeString utext = text; 2401 bi.setText(utext); 2402 int32_t position; 2403 int32_t loops; 2404 for (loops = 0; loops<10; loops++) { 2405 position = bi.next(); 2406 if (position == RuleBasedBreakIterator::DONE) { 2407 break; 2408 } 2409 } 2410 TEST_ASSERT(loops == 1); 2411 } else { 2412 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 2413 } 2414 } 2415 2416 2417 2418 //------------------------------------------------------------------------------- 2419 // 2420 // ReadAndConvertFile Read a text data file, convert it to UChars, and 2421 // return the datain one big UChar * buffer, which the caller must delete. 2422 // 2423 // parameters: 2424 // fileName: the name of the file, with no directory part. The test data directory 2425 // is assumed. 2426 // ulen an out parameter, receives the actual length (in UChars) of the file data. 2427 // encoding The file encoding. If the file contains a BOM, that will override the encoding 2428 // specified here. The BOM, if it exists, will be stripped from the returned data. 2429 // Pass NULL for the system default encoding. 2430 // status 2431 // returns: 2432 // The file data, converted to UChar. 2433 // The caller must delete this when done with 2434 // delete [] theBuffer; 2435 // 2436 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 2437 // Move this function to some common place. 
2438 // 2439 //-------------------------------------------------------------------------------- 2440 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 2441 UChar *retPtr = NULL; 2442 char *fileBuf = NULL; 2443 UConverter* conv = NULL; 2444 FILE *f = NULL; 2445 2446 ulen = 0; 2447 if (U_FAILURE(status)) { 2448 return retPtr; 2449 } 2450 2451 // 2452 // Open the file. 2453 // 2454 f = fopen(fileName, "rb"); 2455 if (f == 0) { 2456 dataerrln("Error opening test data file %s\n", fileName); 2457 status = U_FILE_ACCESS_ERROR; 2458 return NULL; 2459 } 2460 // 2461 // Read it in 2462 // 2463 int fileSize; 2464 int amt_read; 2465 2466 fseek( f, 0, SEEK_END); 2467 fileSize = ftell(f); 2468 fileBuf = new char[fileSize]; 2469 fseek(f, 0, SEEK_SET); 2470 amt_read = fread(fileBuf, 1, fileSize, f); 2471 if (amt_read != fileSize || fileSize <= 0) { 2472 errln("Error reading test data file."); 2473 goto cleanUpAndReturn; 2474 } 2475 2476 // 2477 // Look for a Unicode Signature (BOM) on the data just read 2478 // 2479 int32_t signatureLength; 2480 const char * fileBufC; 2481 const char* bomEncoding; 2482 2483 fileBufC = fileBuf; 2484 bomEncoding = ucnv_detectUnicodeSignature( 2485 fileBuf, fileSize, &signatureLength, &status); 2486 if(bomEncoding!=NULL ){ 2487 fileBufC += signatureLength; 2488 fileSize -= signatureLength; 2489 encoding = bomEncoding; 2490 } 2491 2492 // 2493 // Open a converter to take the rule file to UTF-16 2494 // 2495 conv = ucnv_open(encoding, &status); 2496 if (U_FAILURE(status)) { 2497 goto cleanUpAndReturn; 2498 } 2499 2500 // 2501 // Convert the rules to UChar. 2502 // Preflight first to determine required buffer size. 2503 // 2504 ulen = ucnv_toUChars(conv, 2505 NULL, // dest, 2506 0, // destCapacity, 2507 fileBufC, 2508 fileSize, 2509 &status); 2510 if (status == U_BUFFER_OVERFLOW_ERROR) { 2511 // Buffer Overflow is expected from the preflight operation. 
2512 status = U_ZERO_ERROR; 2513 2514 retPtr = new UChar[ulen+1]; 2515 ucnv_toUChars(conv, 2516 retPtr, // dest, 2517 ulen+1, 2518 fileBufC, 2519 fileSize, 2520 &status); 2521 } 2522 2523 cleanUpAndReturn: 2524 fclose(f); 2525 delete []fileBuf; 2526 ucnv_close(conv); 2527 if (U_FAILURE(status)) { 2528 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2529 delete retPtr; 2530 retPtr = 0; 2531 ulen = 0; 2532 }; 2533 return retPtr; 2534 } 2535 2536 2537 2538 //-------------------------------------------------------------------------------------------- 2539 // 2540 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 2541 // 2542 //------------------------------------------------------------------------------------------- 2543 void RBBITest::TestUnicodeFiles() { 2544 RuleBasedBreakIterator *bi; 2545 UErrorCode status = U_ZERO_ERROR; 2546 2547 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2548 TEST_ASSERT_SUCCESS(status); 2549 if (U_SUCCESS(status)) { 2550 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2551 } 2552 delete bi; 2553 2554 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 2555 TEST_ASSERT_SUCCESS(status); 2556 if (U_SUCCESS(status)) { 2557 runUnicodeTestData("WordBreakTest.txt", bi); 2558 } 2559 delete bi; 2560 2561 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 2562 TEST_ASSERT_SUCCESS(status); 2563 if (U_SUCCESS(status)) { 2564 runUnicodeTestData("SentenceBreakTest.txt", bi); 2565 } 2566 delete bi; 2567 2568 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 2569 TEST_ASSERT_SUCCESS(status); 2570 if (U_SUCCESS(status)) { 2571 runUnicodeTestData("LineBreakTest.txt", bi); 2572 } 2573 delete bi; 2574 } 2575 2576 2577 
//-------------------------------------------------------------------------------------------- 2578 // 2579 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 2580 // 2581 //------------------------------------------------------------------------------------------- 2582 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2583 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2584 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. 2585 UVersionInfo icu4601 = { 4, 6, 0, 1 }; 2586 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601); 2587 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 2588 UErrorCode status = U_ZERO_ERROR; 2589 2590 // 2591 // Open and read the test data file, put it into a UnicodeString. 2592 // 2593 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2594 char testFileName[1000]; 2595 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2596 dataerrln("Can't open test data. Path too long."); 2597 return; 2598 } 2599 strcpy(testFileName, testDataDirectory); 2600 strcat(testFileName, fileName); 2601 2602 logln("Opening data file %s\n", fileName); 2603 2604 int len; 2605 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2606 if (status != U_FILE_ACCESS_ERROR) { 2607 TEST_ASSERT_SUCCESS(status); 2608 TEST_ASSERT(testFile != NULL); 2609 } 2610 if (U_FAILURE(status) || testFile == NULL) { 2611 return; /* something went wrong, error already output */ 2612 } 2613 UnicodeString testFileAsString(TRUE, testFile, len); 2614 2615 // 2616 // Parse the test data file using a regular expression. 2617 // Each kind of token is recognized in its own capture group; what type of item was scanned 2618 // is identified by which group had a match. 
2619 // 2620 // Caputure Group # 1 2 3 4 5 2621 // Parses this item: divide x hex digits comment \n unrecognized \n 2622 // 2623 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2624 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2625 UnicodeString testString; 2626 UVector32 breakPositions(status); 2627 int lineNumber = 1; 2628 TEST_ASSERT_SUCCESS(status); 2629 if (U_FAILURE(status)) { 2630 return; 2631 } 2632 2633 // 2634 // Scan through each test case, building up the string to be broken in testString, 2635 // and the positions that should be boundaries in the breakPositions vector. 2636 // 2637 int spin = 0; 2638 while (tokenMatcher.find()) { 2639 if(tokenMatcher.hitEnd()) { 2640 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 2641 This occurred when the text file was corrupt (wasn't marked as UTF-8) 2642 and caused an infinite loop here on EBCDIC systems! 2643 */ 2644 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 2645 // return; 2646 } 2647 if (tokenMatcher.start(1, status) >= 0) { 2648 // Scanned a divide sign, indicating a break position in the test data. 2649 if (testString.length()>0) { 2650 breakPositions.addElement(testString.length(), status); 2651 } 2652 } 2653 else if (tokenMatcher.start(2, status) >= 0) { 2654 // Scanned an 'x', meaning no break at this position in the test data 2655 // Nothing to be done here. 2656 } 2657 else if (tokenMatcher.start(3, status) >= 0) { 2658 // Scanned Hex digits. Convert them to binary, append to the character data string. 
2659 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2660 int length = hexNumber.length(); 2661 if (length<=8) { 2662 char buf[10]; 2663 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2664 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2665 if (c<=0x10ffff) { 2666 testString.append(c); 2667 } else { 2668 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 2669 fileName, lineNumber); 2670 } 2671 } else { 2672 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2673 fileName, lineNumber); 2674 } 2675 } 2676 else if (tokenMatcher.start(4, status) >= 0) { 2677 // Scanned to end of a line, possibly skipping over a comment in the process. 2678 // If the line from the file contained test data, run the test now. 2679 // 2680 if (testString.length() > 0) { 2681 // TODO(andy): Remove this time bomb code. 2682 if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) { 2683 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2684 } 2685 } 2686 2687 // Clear out this test case. 2688 // The string and breakPositions vector will be refilled as the next 2689 // test case is parsed. 2690 testString.remove(); 2691 breakPositions.removeAllElements(); 2692 lineNumber++; 2693 } else { 2694 // Scanner catchall. Something unrecognized appeared on the line. 2695 char token[16]; 2696 UnicodeString uToken = tokenMatcher.group(0, status); 2697 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2698 token[sizeof(token)-1] = 0; 2699 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2700 2701 // Clean up, in preparation for continuing with the next line. 
2702 testString.remove(); 2703 breakPositions.removeAllElements(); 2704 lineNumber++; 2705 } 2706 TEST_ASSERT_SUCCESS(status); 2707 if (U_FAILURE(status)) { 2708 break; 2709 } 2710 } 2711 2712 delete [] testFile; 2713 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2714 } 2715 2716 //-------------------------------------------------------------------------------------------- 2717 // 2718 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2719 // test data files. Do only a simple, forward-only check - 2720 // this test is mostly to check that ICU and the Unicode 2721 // data agree with each other. 2722 // 2723 //-------------------------------------------------------------------------------------------- 2724 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2725 const UnicodeString &testString, // Text data to be broken 2726 UVector32 *breakPositions, // Positions where breaks should be found. 2727 RuleBasedBreakIterator *bi) { 2728 int32_t pos; // Break Position in the test string 2729 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
2730 int32_t expectedPos; // Expected break position (index into test string) 2731 2732 bi->setText(testString); 2733 pos = bi->first(); 2734 pos = bi->next(); 2735 2736 while (pos != BreakIterator::DONE) { 2737 if (expectedI >= breakPositions->size()) { 2738 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2739 testFileName, lineNumber, pos); 2740 break; 2741 } 2742 expectedPos = breakPositions->elementAti(expectedI); 2743 if (pos < expectedPos) { 2744 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2745 testFileName, lineNumber, pos); 2746 break; 2747 } 2748 if (pos > expectedPos) { 2749 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2750 testFileName, lineNumber, expectedPos); 2751 break; 2752 } 2753 pos = bi->next(); 2754 expectedI++; 2755 } 2756 2757 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2758 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2759 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2760 } 2761 } 2762 2763 2764 2765 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2766 //--------------------------------------------------------------------------------------- 2767 // 2768 // classs RBBIMonkeyKind 2769 // 2770 // Monkey Test for Break Iteration 2771 // Abstract interface class. Concrete derived classes independently 2772 // implement the break rules for different iterator types. 2773 // 2774 // The Monkey Test itself uses doesn't know which type of break iterator it is 2775 // testing, but works purely in terms of the interface defined here. 2776 // 2777 //--------------------------------------------------------------------------------------- 2778 class RBBIMonkeyKind { 2779 public: 2780 // Return a UVector of UnicodeSets, representing the character classes used 2781 // for this type of iterator. 
2782 virtual UVector *charClasses() = 0; 2783 2784 // Set the test text on which subsequent calls to next() will operate 2785 virtual void setText(const UnicodeString &s) = 0; 2786 2787 // Find the next break postion, starting from the prev break position, or from zero. 2788 // Return -1 after reaching end of string. 2789 virtual int32_t next(int32_t i) = 0; 2790 2791 virtual ~RBBIMonkeyKind(); 2792 UErrorCode deferredStatus; 2793 2794 2795 protected: 2796 RBBIMonkeyKind(); 2797 2798 private: 2799 }; 2800 2801 RBBIMonkeyKind::RBBIMonkeyKind() { 2802 deferredStatus = U_ZERO_ERROR; 2803 } 2804 2805 RBBIMonkeyKind::~RBBIMonkeyKind() { 2806 } 2807 2808 2809 //---------------------------------------------------------------------------------------- 2810 // 2811 // Random Numbers. Similar to standard lib rand() and srand() 2812 // Not using library to 2813 // 1. Get same results on all platforms. 2814 // 2. Get access to current seed, to more easily reproduce failures. 2815 // 2816 //--------------------------------------------------------------------------------------- 2817 static uint32_t m_seed = 1; 2818 2819 static uint32_t m_rand() 2820 { 2821 m_seed = m_seed * 1103515245 + 12345; 2822 return (uint32_t)(m_seed/65536) % 32768; 2823 } 2824 2825 2826 //------------------------------------------------------------------------------------------ 2827 // 2828 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2829 // of RBBIMonkeyKind. 
2830 // 2831 //------------------------------------------------------------------------------------------ 2832 class RBBICharMonkey: public RBBIMonkeyKind { 2833 public: 2834 RBBICharMonkey(); 2835 virtual ~RBBICharMonkey(); 2836 virtual UVector *charClasses(); 2837 virtual void setText(const UnicodeString &s); 2838 virtual int32_t next(int32_t i); 2839 private: 2840 UVector *fSets; 2841 2842 UnicodeSet *fCRLFSet; 2843 UnicodeSet *fControlSet; 2844 UnicodeSet *fExtendSet; 2845 UnicodeSet *fPrependSet; 2846 UnicodeSet *fSpacingSet; 2847 UnicodeSet *fLSet; 2848 UnicodeSet *fVSet; 2849 UnicodeSet *fTSet; 2850 UnicodeSet *fLVSet; 2851 UnicodeSet *fLVTSet; 2852 UnicodeSet *fHangulSet; 2853 UnicodeSet *fAnySet; 2854 2855 const UnicodeString *fText; 2856 }; 2857 2858 2859 RBBICharMonkey::RBBICharMonkey() { 2860 UErrorCode status = U_ZERO_ERROR; 2861 2862 fText = NULL; 2863 2864 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2865 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2866 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2867 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2868 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2869 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2870 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2871 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2872 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2873 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2874 fHangulSet = new UnicodeSet(); 2875 fHangulSet->addAll(*fLSet); 2876 fHangulSet->addAll(*fVSet); 2877 
fHangulSet->addAll(*fTSet); 2878 fHangulSet->addAll(*fLVSet); 2879 fHangulSet->addAll(*fLVTSet); 2880 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); 2881 2882 fSets = new UVector(status); 2883 fSets->addElement(fCRLFSet, status); 2884 fSets->addElement(fControlSet, status); 2885 fSets->addElement(fExtendSet, status); 2886 fSets->addElement(fPrependSet, status); 2887 fSets->addElement(fSpacingSet, status); 2888 fSets->addElement(fHangulSet, status); 2889 fSets->addElement(fAnySet, status); 2890 if (U_FAILURE(status)) { 2891 deferredStatus = status; 2892 } 2893 } 2894 2895 2896 void RBBICharMonkey::setText(const UnicodeString &s) { 2897 fText = &s; 2898 } 2899 2900 2901 2902 int32_t RBBICharMonkey::next(int32_t prevPos) { 2903 int p0, p1, p2, p3; // Indices of the significant code points around the 2904 // break position being tested. The candidate break 2905 // location is before p2. 2906 2907 int breakPos = -1; 2908 2909 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2910 2911 if (U_FAILURE(deferredStatus)) { 2912 return -1; 2913 } 2914 2915 // Previous break at end of string. return DONE. 2916 if (prevPos >= fText->length()) { 2917 return -1; 2918 } 2919 p0 = p1 = p2 = p3 = prevPos; 2920 c3 = fText->char32At(prevPos); 2921 c0 = c1 = c2 = 0; 2922 2923 // Loop runs once per "significant" character position in the input text. 2924 for (;;) { 2925 // Move all of the positions forward in the input string. 2926 p0 = p1; c0 = c1; 2927 p1 = p2; c1 = c2; 2928 p2 = p3; c2 = c3; 2929 2930 // Advancd p3 by one codepoint 2931 p3 = fText->moveIndex32(p3, 1); 2932 c3 = fText->char32At(p3); 2933 2934 if (p1 == p2) { 2935 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2936 continue; 2937 } 2938 if (p2 == fText->length()) { 2939 // Reached end of string. Always a break position. 
2940 break; 2941 } 2942 2943 // Rule GB3 CR x LF 2944 // No Extend or Format characters may appear between the CR and LF, 2945 // which requires the additional check for p2 immediately following p1. 2946 // 2947 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2948 continue; 2949 } 2950 2951 // Rule (GB4). ( Control | CR | LF ) <break> 2952 if (fControlSet->contains(c1) || 2953 c1 == 0x0D || 2954 c1 == 0x0A) { 2955 break; 2956 } 2957 2958 // Rule (GB5) <break> ( Control | CR | LF ) 2959 // 2960 if (fControlSet->contains(c2) || 2961 c2 == 0x0D || 2962 c2 == 0x0A) { 2963 break; 2964 } 2965 2966 2967 // Rule (GB6) L x ( L | V | LV | LVT ) 2968 if (fLSet->contains(c1) && 2969 (fLSet->contains(c2) || 2970 fVSet->contains(c2) || 2971 fLVSet->contains(c2) || 2972 fLVTSet->contains(c2))) { 2973 continue; 2974 } 2975 2976 // Rule (GB7) ( LV | V ) x ( V | T ) 2977 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2978 (fVSet->contains(c2) || fTSet->contains(c2))) { 2979 continue; 2980 } 2981 2982 // Rule (GB8) ( LVT | T) x T 2983 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2984 fTSet->contains(c2)) { 2985 continue; 2986 } 2987 2988 // Rule (GB9) Numeric x ALetter 2989 if (fExtendSet->contains(c2)) { 2990 continue; 2991 } 2992 2993 // Rule (GB9a) x SpacingMark 2994 if (fSpacingSet->contains(c2)) { 2995 continue; 2996 } 2997 2998 // Rule (GB9b) Prepend x 2999 if (fPrependSet->contains(c1)) { 3000 continue; 3001 } 3002 3003 // Rule (GB10) Any <break> Any 3004 break; 3005 } 3006 3007 breakPos = p2; 3008 return breakPos; 3009 } 3010 3011 3012 3013 UVector *RBBICharMonkey::charClasses() { 3014 return fSets; 3015 } 3016 3017 3018 RBBICharMonkey::~RBBICharMonkey() { 3019 delete fSets; 3020 delete fCRLFSet; 3021 delete fControlSet; 3022 delete fExtendSet; 3023 delete fPrependSet; 3024 delete fSpacingSet; 3025 delete fLSet; 3026 delete fVSet; 3027 delete fTSet; 3028 delete fLVSet; 3029 delete fLVTSet; 3030 delete fHangulSet; 3031 delete fAnySet; 3032 } 3033 3034 
//------------------------------------------------------------------------------------------ 3035 // 3036 // class RBBIWordMonkey Word Break specific implementation 3037 // of RBBIMonkeyKind. 3038 // 3039 //------------------------------------------------------------------------------------------ 3040 class RBBIWordMonkey: public RBBIMonkeyKind { 3041 public: 3042 RBBIWordMonkey(); 3043 virtual ~RBBIWordMonkey(); 3044 virtual UVector *charClasses(); 3045 virtual void setText(const UnicodeString &s); 3046 virtual int32_t next(int32_t i); 3047 private: 3048 UVector *fSets; 3049 3050 UnicodeSet *fCRSet; 3051 UnicodeSet *fLFSet; 3052 UnicodeSet *fNewlineSet; 3053 UnicodeSet *fKatakanaSet; 3054 UnicodeSet *fALetterSet; 3055 // TODO(jungshik): Do we still need this change? 3056 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 3057 UnicodeSet *fMidNumLetSet; 3058 UnicodeSet *fMidLetterSet; 3059 UnicodeSet *fMidNumSet; 3060 UnicodeSet *fNumericSet; 3061 UnicodeSet *fFormatSet; 3062 UnicodeSet *fOtherSet; 3063 UnicodeSet *fExtendSet; 3064 UnicodeSet *fExtendNumLetSet; 3065 UnicodeSet *fDictionaryCjkSet; 3066 3067 RegexMatcher *fMatcher; 3068 3069 const UnicodeString *fText; 3070 }; 3071 3072 3073 RBBIWordMonkey::RBBIWordMonkey() 3074 { 3075 UErrorCode status = U_ZERO_ERROR; 3076 3077 fSets = new UVector(status); 3078 3079 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 3080 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 3081 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 3082 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); 3083 // Exclude Hangul syllables from ALetterSet during testing. 3084 // Leave CJK dictionary characters out from the monkey tests! 
3085 #if 0 3086 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 3087 "[\\p{Line_Break = Complex_Context}" 3088 "-\\p{Grapheme_Cluster_Break = Extend}" 3089 "-\\p{Grapheme_Cluster_Break = Control}" 3090 "]]", 3091 status); 3092 #endif 3093 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3094 fALetterSet->removeAll(*fDictionaryCjkSet); 3095 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 3096 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 3097 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 3098 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 3099 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); 3100 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 3101 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 3102 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 3103 3104 fOtherSet = new UnicodeSet(); 3105 if(U_FAILURE(status)) { 3106 deferredStatus = status; 3107 return; 3108 } 3109 3110 fOtherSet->complement(); 3111 fOtherSet->removeAll(*fCRSet); 3112 fOtherSet->removeAll(*fLFSet); 3113 fOtherSet->removeAll(*fNewlineSet); 3114 fOtherSet->removeAll(*fKatakanaSet); 3115 fOtherSet->removeAll(*fALetterSet); 3116 fOtherSet->removeAll(*fMidLetterSet); 3117 fOtherSet->removeAll(*fMidNumSet); 3118 fOtherSet->removeAll(*fNumericSet); 3119 fOtherSet->removeAll(*fExtendNumLetSet); 3120 fOtherSet->removeAll(*fFormatSet); 3121 fOtherSet->removeAll(*fExtendSet); 3122 // Inhibit dictionary characters from being tested at all. 
3123 fOtherSet->removeAll(*fDictionaryCjkSet); 3124 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 3125 3126 fSets->addElement(fCRSet, status); 3127 fSets->addElement(fLFSet, status); 3128 fSets->addElement(fNewlineSet, status); 3129 fSets->addElement(fALetterSet, status); 3130 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 3131 fSets->addElement(fMidLetterSet, status); 3132 fSets->addElement(fMidNumLetSet, status); 3133 fSets->addElement(fMidNumSet, status); 3134 fSets->addElement(fNumericSet, status); 3135 fSets->addElement(fFormatSet, status); 3136 fSets->addElement(fExtendSet, status); 3137 fSets->addElement(fOtherSet, status); 3138 fSets->addElement(fExtendNumLetSet, status); 3139 3140 if (U_FAILURE(status)) { 3141 deferredStatus = status; 3142 } 3143 } 3144 3145 void RBBIWordMonkey::setText(const UnicodeString &s) { 3146 fText = &s; 3147 } 3148 3149 3150 int32_t RBBIWordMonkey::next(int32_t prevPos) { 3151 int p0, p1, p2, p3; // Indices of the significant code points around the 3152 // break position being tested. The candidate break 3153 // location is before p2. 3154 3155 int breakPos = -1; 3156 3157 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3158 3159 if (U_FAILURE(deferredStatus)) { 3160 return -1; 3161 } 3162 3163 // Prev break at end of string. return DONE. 3164 if (prevPos >= fText->length()) { 3165 return -1; 3166 } 3167 p0 = p1 = p2 = p3 = prevPos; 3168 c3 = fText->char32At(prevPos); 3169 c0 = c1 = c2 = 0; 3170 3171 // Loop runs once per "significant" character position in the input text. 3172 for (;;) { 3173 // Move all of the positions forward in the input string. 3174 p0 = p1; c0 = c1; 3175 p1 = p2; c1 = c2; 3176 p2 = p3; c2 = c3; 3177 3178 // Advancd p3 by X(Extend | Format)* Rule 4 3179 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 3180 do { 3181 p3 = fText->moveIndex32(p3, 1); 3182 c3 = fText->char32At(p3); 3183 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 3184 break; 3185 }; 3186 } 3187 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 3188 3189 3190 if (p1 == p2) { 3191 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3192 continue; 3193 } 3194 if (p2 == fText->length()) { 3195 // Reached end of string. Always a break position. 3196 break; 3197 } 3198 3199 // Rule (3) CR x LF 3200 // No Extend or Format characters may appear between the CR and LF, 3201 // which requires the additional check for p2 immediately following p1. 3202 // 3203 if (c1==0x0D && c2==0x0A) { 3204 continue; 3205 } 3206 3207 // Rule (3a) Break before and after newlines (including CR and LF) 3208 // 3209 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 3210 break; 3211 }; 3212 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 3213 break; 3214 }; 3215 3216 // Rule (5). 
ALetter x ALetter 3217 if (fALetterSet->contains(c1) && 3218 fALetterSet->contains(c2)) { 3219 continue; 3220 } 3221 3222 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 3223 // 3224 if ( fALetterSet->contains(c1) && 3225 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 3226 fALetterSet->contains(c3)) { 3227 continue; 3228 } 3229 3230 3231 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 3232 if (fALetterSet->contains(c0) && 3233 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 3234 fALetterSet->contains(c2)) { 3235 continue; 3236 } 3237 3238 // Rule (8) Numeric x Numeric 3239 if (fNumericSet->contains(c1) && 3240 fNumericSet->contains(c2)) { 3241 continue; 3242 } 3243 3244 // Rule (9) ALetter x Numeric 3245 if (fALetterSet->contains(c1) && 3246 fNumericSet->contains(c2)) { 3247 continue; 3248 } 3249 3250 // Rule (10) Numeric x ALetter 3251 if (fNumericSet->contains(c1) && 3252 fALetterSet->contains(c2)) { 3253 continue; 3254 } 3255 3256 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 3257 if (fNumericSet->contains(c0) && 3258 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 3259 fNumericSet->contains(c2)) { 3260 continue; 3261 } 3262 3263 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 3264 if (fNumericSet->contains(c1) && 3265 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 3266 fNumericSet->contains(c3)) { 3267 continue; 3268 } 3269 3270 // Rule (13) Katakana x Katakana 3271 if (fKatakanaSet->contains(c1) && 3272 fKatakanaSet->contains(c2)) { 3273 continue; 3274 } 3275 3276 // Rule 13a 3277 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 3278 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 3279 fExtendNumLetSet->contains(c2)) { 3280 continue; 3281 } 3282 3283 // Rule 13b 3284 if (fExtendNumLetSet->contains(c1) && 3285 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 3286 fKatakanaSet->contains(c2))) { 3287 continue; 3288 } 3289 3290 // Rule 14. 
Break found here. 3291 break; 3292 } 3293 3294 breakPos = p2; 3295 return breakPos; 3296 } 3297 3298 3299 UVector *RBBIWordMonkey::charClasses() { 3300 return fSets; 3301 } 3302 3303 3304 RBBIWordMonkey::~RBBIWordMonkey() { 3305 delete fSets; 3306 delete fCRSet; 3307 delete fLFSet; 3308 delete fNewlineSet; 3309 delete fKatakanaSet; 3310 delete fALetterSet; 3311 delete fMidNumLetSet; 3312 delete fMidLetterSet; 3313 delete fMidNumSet; 3314 delete fNumericSet; 3315 delete fFormatSet; 3316 delete fExtendSet; 3317 delete fExtendNumLetSet; 3318 delete fOtherSet; 3319 } 3320 3321 3322 3323 3324 //------------------------------------------------------------------------------------------ 3325 // 3326 // class RBBISentMonkey Sentence Break specific implementation 3327 // of RBBIMonkeyKind. 3328 // 3329 //------------------------------------------------------------------------------------------ 3330 class RBBISentMonkey: public RBBIMonkeyKind { 3331 public: 3332 RBBISentMonkey(); 3333 virtual ~RBBISentMonkey(); 3334 virtual UVector *charClasses(); 3335 virtual void setText(const UnicodeString &s); 3336 virtual int32_t next(int32_t i); 3337 private: 3338 int moveBack(int posFrom); 3339 int moveForward(int posFrom); 3340 UChar32 cAt(int pos); 3341 3342 UVector *fSets; 3343 3344 UnicodeSet *fSepSet; 3345 UnicodeSet *fFormatSet; 3346 UnicodeSet *fSpSet; 3347 UnicodeSet *fLowerSet; 3348 UnicodeSet *fUpperSet; 3349 UnicodeSet *fOLetterSet; 3350 UnicodeSet *fNumericSet; 3351 UnicodeSet *fATermSet; 3352 UnicodeSet *fSContinueSet; 3353 UnicodeSet *fSTermSet; 3354 UnicodeSet *fCloseSet; 3355 UnicodeSet *fOtherSet; 3356 UnicodeSet *fExtendSet; 3357 3358 const UnicodeString *fText; 3359 3360 }; 3361 3362 RBBISentMonkey::RBBISentMonkey() 3363 { 3364 UErrorCode status = U_ZERO_ERROR; 3365 3366 fSets = new UVector(status); 3367 3368 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 3369 // set and made into character classes of their own. 
For the monkey impl, 3370 // they remain in SEP, since Sep always appears with CR and LF in the rules. 3371 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 3372 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 3373 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 3374 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 3375 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 3376 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 3377 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 3378 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 3379 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 3380 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 3381 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 3382 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 3383 fOtherSet = new UnicodeSet(); 3384 3385 if(U_FAILURE(status)) { 3386 deferredStatus = status; 3387 return; 3388 } 3389 3390 fOtherSet->complement(); 3391 fOtherSet->removeAll(*fSepSet); 3392 fOtherSet->removeAll(*fFormatSet); 3393 fOtherSet->removeAll(*fSpSet); 3394 fOtherSet->removeAll(*fLowerSet); 3395 fOtherSet->removeAll(*fUpperSet); 3396 fOtherSet->removeAll(*fOLetterSet); 3397 fOtherSet->removeAll(*fNumericSet); 3398 fOtherSet->removeAll(*fATermSet); 3399 fOtherSet->removeAll(*fSContinueSet); 3400 fOtherSet->removeAll(*fSTermSet); 3401 fOtherSet->removeAll(*fCloseSet); 3402 fOtherSet->removeAll(*fExtendSet); 3403 3404 fSets->addElement(fSepSet, status); 3405 
fSets->addElement(fFormatSet, status); 3406 fSets->addElement(fSpSet, status); 3407 fSets->addElement(fLowerSet, status); 3408 fSets->addElement(fUpperSet, status); 3409 fSets->addElement(fOLetterSet, status); 3410 fSets->addElement(fNumericSet, status); 3411 fSets->addElement(fATermSet, status); 3412 fSets->addElement(fSContinueSet, status); 3413 fSets->addElement(fSTermSet, status); 3414 fSets->addElement(fCloseSet, status); 3415 fSets->addElement(fOtherSet, status); 3416 fSets->addElement(fExtendSet, status); 3417 3418 if (U_FAILURE(status)) { 3419 deferredStatus = status; 3420 } 3421 } 3422 3423 3424 3425 void RBBISentMonkey::setText(const UnicodeString &s) { 3426 fText = &s; 3427 } 3428 3429 UVector *RBBISentMonkey::charClasses() { 3430 return fSets; 3431 } 3432 3433 3434 // moveBack() Find the "significant" code point preceding the index i. 3435 // Skips over ($Extend | $Format)* . 3436 // 3437 int RBBISentMonkey::moveBack(int i) { 3438 if (i <= 0) { 3439 return -1; 3440 } 3441 UChar32 c; 3442 int32_t j = i; 3443 do { 3444 j = fText->moveIndex32(j, -1); 3445 c = fText->char32At(j); 3446 } 3447 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 3448 return j; 3449 3450 } 3451 3452 3453 int RBBISentMonkey::moveForward(int i) { 3454 if (i>=fText->length()) { 3455 return fText->length(); 3456 } 3457 UChar32 c; 3458 int32_t j = i; 3459 do { 3460 j = fText->moveIndex32(j, 1); 3461 c = cAt(j); 3462 } 3463 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 3464 return j; 3465 } 3466 3467 UChar32 RBBISentMonkey::cAt(int pos) { 3468 if (pos<0 || pos>=fText->length()) { 3469 return -1; 3470 } else { 3471 return fText->char32At(pos); 3472 } 3473 } 3474 3475 int32_t RBBISentMonkey::next(int32_t prevPos) { 3476 int p0, p1, p2, p3; // Indices of the significant code points around the 3477 // break position being tested. The candidate break 3478 // location is before p2. 
3479 3480 int breakPos = -1; 3481 3482 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3483 UChar32 c; 3484 3485 if (U_FAILURE(deferredStatus)) { 3486 return -1; 3487 } 3488 3489 // Prev break at end of string. return DONE. 3490 if (prevPos >= fText->length()) { 3491 return -1; 3492 } 3493 p0 = p1 = p2 = p3 = prevPos; 3494 c3 = fText->char32At(prevPos); 3495 c0 = c1 = c2 = 0; 3496 3497 // Loop runs once per "significant" character position in the input text. 3498 for (;;) { 3499 // Move all of the positions forward in the input string. 3500 p0 = p1; c0 = c1; 3501 p1 = p2; c1 = c2; 3502 p2 = p3; c2 = c3; 3503 3504 // Advancd p3 by X(Extend | Format)* Rule 4 3505 p3 = moveForward(p3); 3506 c3 = cAt(p3); 3507 3508 // Rule (3) CR x LF 3509 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 3510 continue; 3511 } 3512 3513 // Rule (4). Sep <break> 3514 if (fSepSet->contains(c1)) { 3515 p2 = p1+1; // Separators don't combine with Extend or Format. 3516 break; 3517 } 3518 3519 if (p2 >= fText->length()) { 3520 // Reached end of string. Always a break position. 3521 break; 3522 } 3523 3524 if (p2 == prevPos) { 3525 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3526 continue; 3527 } 3528 3529 // Rule (6). ATerm x Numeric 3530 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 3531 continue; 3532 } 3533 3534 // Rule (7). Upper ATerm x Uppper 3535 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 3536 continue; 3537 } 3538 3539 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 3540 // Note: STerm | ATerm are added to the negated part of the expression by a 3541 // note to the Unicode 5.0 documents. 
3542 int p8 = p1; 3543 while (fSpSet->contains(cAt(p8))) { 3544 p8 = moveBack(p8); 3545 } 3546 while (fCloseSet->contains(cAt(p8))) { 3547 p8 = moveBack(p8); 3548 } 3549 if (fATermSet->contains(cAt(p8))) { 3550 p8=p2; 3551 for (;;) { 3552 c = cAt(p8); 3553 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 3554 fLowerSet->contains(c) || fSepSet->contains(c) || 3555 fATermSet->contains(c) || fSTermSet->contains(c)) { 3556 break; 3557 } 3558 p8 = moveForward(p8); 3559 } 3560 if (fLowerSet->contains(cAt(p8))) { 3561 continue; 3562 } 3563 } 3564 3565 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 3566 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 3567 p8 = p1; 3568 while (fSpSet->contains(cAt(p8))) { 3569 p8 = moveBack(p8); 3570 } 3571 while (fCloseSet->contains(cAt(p8))) { 3572 p8 = moveBack(p8); 3573 } 3574 c = cAt(p8); 3575 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 3576 continue; 3577 } 3578 } 3579 3580 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 3581 int p9 = p1; 3582 while (fCloseSet->contains(cAt(p9))) { 3583 p9 = moveBack(p9); 3584 } 3585 c = cAt(p9); 3586 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 3587 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 3588 continue; 3589 } 3590 } 3591 3592 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 3593 int p10 = p1; 3594 while (fSpSet->contains(cAt(p10))) { 3595 p10 = moveBack(p10); 3596 } 3597 while (fCloseSet->contains(cAt(p10))) { 3598 p10 = moveBack(p10); 3599 } 3600 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 3601 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 3602 continue; 3603 } 3604 } 3605 3606 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 3607 int p11 = p1; 3608 if (fSepSet->contains(cAt(p11))) { 3609 p11 = moveBack(p11); 3610 } 3611 while (fSpSet->contains(cAt(p11))) { 3612 p11 = moveBack(p11); 3613 } 3614 while (fCloseSet->contains(cAt(p11))) { 3615 p11 = moveBack(p11); 3616 } 3617 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 3618 break; 3619 } 3620 3621 // Rule (12) Any x Any 3622 continue; 3623 } 3624 breakPos = p2; 3625 return breakPos; 3626 } 3627 3628 RBBISentMonkey::~RBBISentMonkey() { 3629 delete fSets; 3630 delete fSepSet; 3631 delete fFormatSet; 3632 delete fSpSet; 3633 delete fLowerSet; 3634 delete fUpperSet; 3635 delete fOLetterSet; 3636 delete fNumericSet; 3637 delete fATermSet; 3638 delete fSContinueSet; 3639 delete fSTermSet; 3640 delete fCloseSet; 3641 delete fOtherSet; 3642 delete fExtendSet; 3643 } 3644 3645 3646 3647 //------------------------------------------------------------------------------------------- 3648 // 3649 // RBBILineMonkey 3650 // 3651 //------------------------------------------------------------------------------------------- 3652 3653 class RBBILineMonkey: public RBBIMonkeyKind { 3654 public: 3655 RBBILineMonkey(); 3656 virtual ~RBBILineMonkey(); 3657 virtual UVector *charClasses(); 3658 virtual void setText(const UnicodeString &s); 3659 virtual int32_t next(int32_t i); 3660 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 3661 private: 3662 UVector *fSets; 3663 3664 UnicodeSet *fBK; 3665 UnicodeSet *fCR; 3666 UnicodeSet *fLF; 3667 UnicodeSet *fCM; 3668 UnicodeSet *fNL; 3669 UnicodeSet *fSG; 3670 UnicodeSet *fWJ; 3671 UnicodeSet *fZW; 3672 UnicodeSet *fGL; 3673 UnicodeSet *fCB; 3674 UnicodeSet *fSP; 3675 UnicodeSet *fB2; 3676 UnicodeSet *fBA; 3677 UnicodeSet *fBB; 3678 UnicodeSet *fHY; 3679 UnicodeSet *fH2; 3680 UnicodeSet *fH3; 3681 UnicodeSet *fCL; 3682 UnicodeSet *fCP; 3683 UnicodeSet *fEX; 3684 UnicodeSet *fIN; 3685 UnicodeSet *fJL; 3686 UnicodeSet *fJV; 3687 UnicodeSet *fJT; 
3688 UnicodeSet *fNS; 3689 UnicodeSet *fOP; 3690 UnicodeSet *fQU; 3691 UnicodeSet *fIS; 3692 UnicodeSet *fNU; 3693 UnicodeSet *fPO; 3694 UnicodeSet *fPR; 3695 UnicodeSet *fSY; 3696 UnicodeSet *fAI; 3697 UnicodeSet *fAL; 3698 UnicodeSet *fID; 3699 UnicodeSet *fSA; 3700 UnicodeSet *fXX; 3701 3702 BreakIterator *fCharBI; 3703 3704 const UnicodeString *fText; 3705 int32_t *fOrigPositions; 3706 3707 RegexMatcher *fNumberMatcher; 3708 RegexMatcher *fLB11Matcher; 3709 }; 3710 3711 3712 RBBILineMonkey::RBBILineMonkey() 3713 { 3714 UErrorCode status = U_ZERO_ERROR; 3715 3716 fSets = new UVector(status); 3717 3718 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3719 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3720 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3721 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3722 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3723 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3724 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3725 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3726 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3727 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3728 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3729 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3730 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3731 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3732 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3733 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3734 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), 
status); 3735 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 3736 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3737 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3738 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3739 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3740 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3741 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3742 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3743 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3744 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3745 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3746 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3747 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3748 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3749 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3750 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3751 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3752 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3753 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3754 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3755 3756 if (U_FAILURE(status)) { 3757 deferredStatus = status; 3758 fCharBI = NULL; 3759 fNumberMatcher = NULL; 3760 return; 3761 } 3762 3763 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3764 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3765 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults 
to AL 3766 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3767 3768 fSets->addElement(fBK, status); 3769 fSets->addElement(fCR, status); 3770 fSets->addElement(fLF, status); 3771 fSets->addElement(fCM, status); 3772 fSets->addElement(fNL, status); 3773 fSets->addElement(fWJ, status); 3774 fSets->addElement(fZW, status); 3775 fSets->addElement(fGL, status); 3776 fSets->addElement(fCB, status); 3777 fSets->addElement(fSP, status); 3778 fSets->addElement(fB2, status); 3779 fSets->addElement(fBA, status); 3780 fSets->addElement(fBB, status); 3781 fSets->addElement(fHY, status); 3782 fSets->addElement(fH2, status); 3783 fSets->addElement(fH3, status); 3784 fSets->addElement(fCL, status); 3785 fSets->addElement(fCP, status); 3786 fSets->addElement(fEX, status); 3787 fSets->addElement(fIN, status); 3788 fSets->addElement(fJL, status); 3789 fSets->addElement(fJT, status); 3790 fSets->addElement(fJV, status); 3791 fSets->addElement(fNS, status); 3792 fSets->addElement(fOP, status); 3793 fSets->addElement(fQU, status); 3794 fSets->addElement(fIS, status); 3795 fSets->addElement(fNU, status); 3796 fSets->addElement(fPO, status); 3797 fSets->addElement(fPR, status); 3798 fSets->addElement(fSY, status); 3799 fSets->addElement(fAI, status); 3800 fSets->addElement(fAL, status); 3801 fSets->addElement(fID, status); 3802 fSets->addElement(fWJ, status); 3803 fSets->addElement(fSA, status); 3804 fSets->addElement(fSG, status); 3805 3806 const char *rules = 3807 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3808 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3809 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3810 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3811 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";

    fNumberMatcher = new RegexMatcher(
        UnicodeString(rules, -1, US_INV), 0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

    // Construction errors are not reported here; they are recorded in deferredStatus,
    // which next() checks before doing any work.
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


//
//  setText
//       Remember the text to be analyzed (by pointer, the string is not copied)
//       and point the character break iterator and the number matcher at it.
//       Dereferences fCharBI and fNumberMatcher without a NULL check.
//
void RBBILineMonkey::setText(const UnicodeString &s) {
    fText = &s;
    fCharBI->setText(s);
    fNumberMatcher->reset(s);
}

//
//  rule9Adjust
//       Line Break TR rules 9 and 10 implementation.
//       This deals with combining marks and other sequences
//       that must be treated as if they were something other than what they actually are.
//
//       This is factored out into a separate function because it must be applied twice for
//       each potential break, once to the chars before the position being checked, then
//       again to the text following the possible break.
//
//       Parameters:
//          pos       - index of the character being adjusted.  -1 means "no character yet";
//                      the function then returns without touching any of the other arguments.
//          posChar   - in/out.  The character at pos.  Replaced by 0x41 ('A') when it is
//                      itself a member of the CM set (LB 10).
//          nextPos   - in/out.  Index of the character following pos.  Advanced past any
//                      run of CM characters, unless posChar is SP, BK, CR, LF, NL or ZW.
//          nextChar  - out.  Set to the character at the (possibly advanced) nextPos.
//
void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
    if (pos == -1) {
        // Invalid initial position.  Happens during the warmup iteration of the
        //   main loop in next().
        return;
    }

    int32_t nPos = *nextPos;

    // LB 9  Keep combining sequences together.
    //   advance over any CM class chars.  Note that Line Break CM is different
    //   from the normal Grapheme Extend property.
    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
        for (;;) {
            *nextChar = fText->char32At(nPos);
            if (!fCM->contains(*nextChar)) {
                break;
            }
            nPos = fText->moveIndex32(nPos, 1);
        }
    }


    // LB 9 Treat X CM* as if it were x.
    //       No explicit action required.

    // LB 10  Treat any remaining combining mark as AL
    if (fCM->contains(*posChar)) {
        *posChar = 0x41;   // thisChar = 'A';
    }

    // Push the updated nextPos and nextChar back to our caller.
    // This only makes a difference if posChar got bigger by consuming a
    // combining sequence.
    *nextPos  = nPos;
    *nextChar = fText->char32At(nPos);
}



//
//  next
//       Find the next line break position following startPos in the text set by setText(),
//       by applying the line break rules below one at a time, in order, to each candidate
//       position.  Within the loop, "continue" means "no break here, try the next position"
//       and "break" means "break found at pos".
//
//       Returns the index of the break, or -1 if construction of this object failed
//       (deferredStatus) or startPos is at or beyond the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //        (In this function the number regular expression is applied under the
        //        "LB 25 Numbers" heading below.)
        //
        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
            fEX->contains(thisChar) ||
            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this tPos is a new local that shadows the function-level tPos.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // Note: this tPos is a new local that shadows the function-level tPos.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 22
        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          NU x AL
        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //         PR x ID
        //         PR x AL
        //         PO x AL
        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
            (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
            (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
            continue;
        }



        // LB 25    Numbers
        //          The number regular expression is tried starting at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28 Do not break between alphabetics ("at").
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //       (AL | NU) x OP
        //       CP x (AL | NU)
        if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB 31 Break everywhere else
        break;

    }

    return pos;
}


//
//  charClasses
//       Return the vector of character class sets held in fSets.
//       The returned pointer is the member itself, not a copy; the destructor deletes it.
//
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}


//
//  Destructor
//       Deletes the fSets vector, every individual UnicodeSet member,
//       the character break iterator and the number matcher.
//
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fID;
    delete fSA;
    delete fSG;
    delete fXX;

    delete fCharBI;
    delete fNumberMatcher;
}


//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
4304 // 4305 // type = char | word | line | sent | title 4306 // 4307 //------------------------------------------------------------------------------------------- 4308 4309 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 4310 int32_t val = defaultVal; 4311 name.append(" *= *(-?\\d+)"); 4312 UErrorCode status = U_ZERO_ERROR; 4313 RegexMatcher m(name, params, 0, status); 4314 if (m.find()) { 4315 // The param exists. Convert the string to an int. 4316 char valString[100]; 4317 int32_t paramLength = m.end(1, status) - m.start(1, status); 4318 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 4319 paramLength = (int32_t)(sizeof(valString)-2); 4320 } 4321 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 4322 val = strtol(valString, NULL, 10); 4323 4324 // Delete this parameter from the params string. 4325 m.reset(); 4326 params = m.replaceFirst("", status); 4327 } 4328 U_ASSERT(U_SUCCESS(status)); 4329 return val; 4330 } 4331 #endif 4332 4333 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 4334 BreakIterator *bi, 4335 int expected[], 4336 int expectedcount) 4337 { 4338 int count = 0; 4339 int i = 0; 4340 int forward[50]; 4341 bi->setText(ustr); 4342 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4343 forward[count] = i; 4344 if (count < expectedcount && expected[count] != i) { 4345 test->errln("break forward test failed: expected %d but got %d", 4346 expected[count], i); 4347 break; 4348 } 4349 count ++; 4350 } 4351 if (count != expectedcount) { 4352 printStringBreaks(ustr, expected, expectedcount); 4353 test->errln("break forward test failed: missed %d match", 4354 expectedcount - count); 4355 return; 4356 } 4357 // testing boundaries 4358 for (i = 1; i < expectedcount; i ++) { 4359 int j = expected[i - 1]; 4360 if (!bi->isBoundary(j)) { 4361 printStringBreaks(ustr, expected, expectedcount); 4362 test->errln("isBoundary() failed. 
Expected boundary at position %d", j); 4363 return; 4364 } 4365 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 4366 if (bi->isBoundary(j)) { 4367 printStringBreaks(ustr, expected, expectedcount); 4368 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 4369 return; 4370 } 4371 } 4372 } 4373 4374 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 4375 count --; 4376 if (forward[count] != i) { 4377 printStringBreaks(ustr, expected, expectedcount); 4378 test->errln("happy break test previous() failed: expected %d but got %d", 4379 forward[count], i); 4380 break; 4381 } 4382 } 4383 if (count != 0) { 4384 printStringBreaks(ustr, expected, expectedcount); 4385 test->errln("break test previous() failed: missed a match"); 4386 return; 4387 } 4388 4389 // testing preceding 4390 for (i = 0; i < expectedcount - 1; i ++) { 4391 // int j = expected[i] + 1; 4392 int j = ustr.moveIndex32(expected[i], 1); 4393 for (; j <= expected[i + 1]; j ++) { 4394 if (bi->preceding(j) != expected[i]) { 4395 printStringBreaks(ustr, expected, expectedcount); 4396 test->errln("preceding(): Not expecting boundary at position %d", j); 4397 return; 4398 } 4399 } 4400 } 4401 } 4402 4403 void RBBITest::TestWordBreaks(void) 4404 { 4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4406 4407 Locale locale("en"); 4408 UErrorCode status = U_ZERO_ERROR; 4409 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4410 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4411 // Replaced any C+J characters in a row with a random sequence of characters 4412 // of the same length to make our C+J segmentation not get in the way. 
4413 static const char *strlist[] = 4414 { 4415 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 4416 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 4417 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 4418 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 4419 "\\uac00\\u3588\\u009c\\u0953\\u194b", 4420 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4421 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 4422 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 4423 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4424 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4425 "\\u2027\\U000e0067\\u0a47\\u00b7", 4426 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4427 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4428 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4429 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 4430 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4431 "\\u0027\\u11af\\U000e0057\\u0602", 4432 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4433 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4434 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4435 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4436 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4437 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4438 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4439 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4440 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4441 "\\u18f4\\U000e0049\\u20e7\\u2027", 4442 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4443 "\\ua183\\u102d\\u0bec\\u003a", 4444 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4445 "\\u003a\\u0e57\\u0fad\\u002e", 4446 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4447 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4448 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 4449 
"\\u003a\\u0664\\u00b7\\u1fba", 4450 "\\u003b\\u0027\\u00b7\\u47a3", 4451 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 4452 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 4453 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 4454 }; 4455 int loop; 4456 if (U_FAILURE(status)) { 4457 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4458 return; 4459 } 4460 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4461 // printf("looping %d\n", loop); 4462 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 4463 // RBBICharMonkey monkey; 4464 RBBIWordMonkey monkey; 4465 4466 int expected[50]; 4467 int expectedcount = 0; 4468 4469 monkey.setText(ustr); 4470 int i; 4471 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4472 expected[expectedcount ++] = i; 4473 } 4474 4475 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4476 } 4477 delete bi; 4478 #endif 4479 } 4480 4481 void RBBITest::TestWordBoundary(void) 4482 { 4483 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 4484 Locale locale("en"); 4485 UErrorCode status = U_ZERO_ERROR; 4486 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4487 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4488 UChar str[50]; 4489 static const char *strlist[] = 4490 { 4491 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4492 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4493 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4494 "\\u2027\\U000e0067\\u0a47\\u00b7", 4495 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4496 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4497 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4498 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4499 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4500 "\\u0027\\u11af\\U000e0057\\u0602", 4501 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 4502 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4503 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4504 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4505 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4506 "\\U000e0065\\u302c\\u09ee\\U000e0068", 4507 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4508 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4509 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4510 "\\u58f4\\U000e0049\\u20e7\\u2027", 4511 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4512 "\\ua183\\u102d\\u0bec\\u003a", 4513 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4514 "\\u003a\\u0e57\\u0fad\\u002e", 4515 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4516 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4517 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4518 "\\u003a\\u0664\\u00b7\\u1fba", 4519 "\\u003b\\u0027\\u00b7\\u47a3", 4520 }; 4521 int loop; 4522 if (U_FAILURE(status)) { 4523 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4524 return; 4525 } 4526 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4527 // printf("looping %d\n", loop); 4528 u_unescape(strlist[loop], str, 20); 4529 UnicodeString ustr(str); 4530 int forward[50]; 4531 int count = 0; 4532 4533 bi->setText(ustr); 4534 int prev = 0; 4535 int i; 4536 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4537 forward[count ++] = i; 4538 if (i > prev) { 4539 int j; 4540 for (j = prev + 1; j < i; j ++) { 4541 if (bi->isBoundary(j)) { 4542 printStringBreaks(ustr, forward, count); 4543 errln("happy boundary test failed: expected %d not a boundary", 4544 j); 4545 return; 4546 } 4547 } 4548 } 4549 if (!bi->isBoundary(i)) { 4550 printStringBreaks(ustr, forward, count); 4551 errln("happy boundary test failed: expected %d a boundary", 4552 i); 4553 return; 4554 } 4555 prev = i; 4556 } 4557 } 4558 delete 
bi; 4559 } 4560 4561 void RBBITest::TestLineBreaks(void) 4562 { 4563 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4564 Locale locale("en"); 4565 UErrorCode status = U_ZERO_ERROR; 4566 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4567 const int32_t STRSIZE = 50; 4568 UChar str[STRSIZE]; 4569 static const char *strlist[] = 4570 { 4571 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4572 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 4573 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4574 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4575 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4576 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 4577 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4578 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4579 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4580 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4581 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4582 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4583 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4584 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4585 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4586 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4587 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4588 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4589 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4590 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4591 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4592 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4593 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4594 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4595 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4596 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4597 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4598 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4599 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4600 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4601 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4602 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4603 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4604 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4605 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4606 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4607 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4608 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4609 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4610 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4611 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4612 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4613 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4614 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4615 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4616 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4617 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4618 }; 4619 int loop; 4620 TEST_ASSERT_SUCCESS(status); 4621 if (U_FAILURE(status)) { 4622 return; 4623 } 4624 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4625 // printf("looping %d\n", loop); 4626 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4627 if (t >= STRSIZE) { 4628 TEST_ASSERT(FALSE); 4629 continue; 4630 } 4631 4632 4633 UnicodeString ustr(str); 4634 RBBILineMonkey monkey; 4635 if (U_FAILURE(monkey.deferredStatus)) { 4636 continue; 4637 } 4638 4639 const int EXPECTEDSIZE = 50; 4640 int expected[EXPECTEDSIZE]; 4641 int expectedcount = 0; 4642 4643 monkey.setText(ustr); 4644 int i; 4645 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4646 if (expectedcount >= EXPECTEDSIZE) { 4647 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4648 return; 4649 } 4650 expected[expectedcount ++] = i; 4651 } 4652 4653 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4654 } 4655 delete bi; 4656 #endif 4657 } 4658 4659 void RBBITest::TestSentBreaks(void) 4660 { 4661 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4662 Locale locale("en"); 4663 UErrorCode status = U_ZERO_ERROR; 4664 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4665 UChar str[200]; 4666 static const char *strlist[] = 4667 { 4668 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4669 "This\n", 4670 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4671 "\"Sentence ending with a quote.\" Bye.", 4672 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4673 "Hi! 
This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4674 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4675 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4676 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4677 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 4678 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4679 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4680 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4681 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4682 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4683 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4684 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4685 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4686 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4687 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4688 }; 4689 int loop; 4690 if (U_FAILURE(status)) { 4691 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4692 return; 4693 } 4694 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4695 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4696 UnicodeString ustr(str); 4697 4698 RBBISentMonkey monkey; 4699 if (U_FAILURE(monkey.deferredStatus)) { 4700 continue; 4701 } 4702 4703 const int EXPECTEDSIZE = 50; 4704 int expected[EXPECTEDSIZE]; 4705 int expectedcount = 0; 4706 4707 monkey.setText(ustr); 4708 int i; 4709 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4710 if (expectedcount >= EXPECTEDSIZE) { 4711 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4712 return; 4713 
} 4714 expected[expectedcount ++] = i; 4715 } 4716 4717 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4718 } 4719 delete bi; 4720 #endif 4721 } 4722 4723 void RBBITest::TestMonkey(char *params) { 4724 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4725 4726 UErrorCode status = U_ZERO_ERROR; 4727 int32_t loopCount = 500; 4728 int32_t seed = 1; 4729 UnicodeString breakType = "all"; 4730 Locale locale("en"); 4731 UBool useUText = FALSE; 4732 4733 if (quick == FALSE) { 4734 loopCount = 10000; 4735 } 4736 4737 if (params) { 4738 UnicodeString p(params); 4739 loopCount = getIntParam("loop", p, loopCount); 4740 seed = getIntParam("seed", p, seed); 4741 4742 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4743 if (m.find()) { 4744 breakType = m.group(1, status); 4745 m.reset(); 4746 p = m.replaceFirst("", status); 4747 } 4748 4749 RegexMatcher u(" *utext", p, 0, status); 4750 if (u.find()) { 4751 useUText = TRUE; 4752 u.reset(); 4753 p = u.replaceFirst("", status); 4754 } 4755 4756 4757 // m.reset(p); 4758 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4759 // Each option is stripped out of the option string as it is processed. 4760 // All options have been checked. The option string should have been completely emptied.. 
4761 char buf[100]; 4762 p.extract(buf, sizeof(buf), NULL, status); 4763 buf[sizeof(buf)-1] = 0; 4764 errln("Unrecognized or extra parameter: %s\n", buf); 4765 return; 4766 } 4767 4768 } 4769 4770 if (breakType == "char" || breakType == "all") { 4771 RBBICharMonkey m; 4772 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4773 if (U_SUCCESS(status)) { 4774 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4775 if (breakType == "all" && useUText==FALSE) { 4776 // Also run a quick test with UText when "all" is specified 4777 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4778 } 4779 } 4780 else { 4781 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4782 } 4783 delete bi; 4784 } 4785 4786 if (breakType == "word" || breakType == "all") { 4787 logln("Word Break Monkey Test"); 4788 RBBIWordMonkey m; 4789 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4790 if (U_SUCCESS(status)) { 4791 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4792 } 4793 else { 4794 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4795 } 4796 delete bi; 4797 } 4798 4799 if (breakType == "line" || breakType == "all") { 4800 logln("Line Break Monkey Test"); 4801 RBBILineMonkey m; 4802 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4803 if (loopCount >= 10) { 4804 loopCount = loopCount / 5; // Line break runs slower than the others. 
4805 } 4806 if (U_SUCCESS(status)) { 4807 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4808 } 4809 else { 4810 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4811 } 4812 delete bi; 4813 } 4814 4815 if (breakType == "sent" || breakType == "all" ) { 4816 logln("Sentence Break Monkey Test"); 4817 RBBISentMonkey m; 4818 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4819 if (loopCount >= 10) { 4820 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4821 } 4822 if (U_SUCCESS(status)) { 4823 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4824 } 4825 else { 4826 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4827 } 4828 delete bi; 4829 } 4830 4831 #endif 4832 } 4833 4834 // 4835 // Run a RBBI monkey test. Common routine, for all break iterator types. 4836 // Parameters: 4837 // bi - the break iterator to use 4838 // mk - MonkeyKind, abstraction for obtaining expected results 4839 // name - Name of test (char, word, etc.) 
for use in error messages 4840 // seed - Seed for starting random number generator (parameter from user) 4841 // numIterations 4842 // 4843 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4844 int32_t numIterations, UBool useUText) { 4845 4846 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4847 4848 const int32_t TESTSTRINGLEN = 500; 4849 UnicodeString testText; 4850 int32_t numCharClasses; 4851 UVector *chClasses; 4852 int expected[TESTSTRINGLEN*2 + 1]; 4853 int expectedCount = 0; 4854 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4855 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4856 char reverseBreaks[TESTSTRINGLEN*2+1]; 4857 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4858 char followingBreaks[TESTSTRINGLEN*2+1]; 4859 char precedingBreaks[TESTSTRINGLEN*2+1]; 4860 int i; 4861 int loopCount = 0; 4862 4863 m_seed = seed; 4864 4865 numCharClasses = mk.charClasses()->size(); 4866 chClasses = mk.charClasses(); 4867 4868 // Check for errors that occured during the construction of the MonkeyKind object. 4869 // Can't report them where they occured because errln() is a method coming from intlTest, 4870 // and is not visible outside of RBBITest :-( 4871 if (U_FAILURE(mk.deferredStatus)) { 4872 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4873 return; 4874 } 4875 4876 // Verify that the character classes all have at least one member. 4877 for (i=0; i<numCharClasses; i++) { 4878 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4879 if (s == NULL || s->size() == 0) { 4880 errln("Character Class #%d is null or of zero size.", i); 4881 return; 4882 } 4883 } 4884 4885 while (loopCount < numIterations || numIterations == -1) { 4886 if (numIterations == -1 && loopCount % 10 == 0) { 4887 // If test is running in an infinite loop, display a periodic tic so 4888 // we can tell that it is making progress. 
4889 fprintf(stderr, "."); 4890 } 4891 // Save current random number seed, so that we can recreate the random numbers 4892 // for this loop iteration in event of an error. 4893 seed = m_seed; 4894 4895 // Populate a test string with data. 4896 testText.truncate(0); 4897 for (i=0; i<TESTSTRINGLEN; i++) { 4898 int32_t aClassNum = m_rand() % numCharClasses; 4899 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4900 int32_t charIdx = m_rand() % classSet->size(); 4901 UChar32 c = classSet->charAt(charIdx); 4902 if (c < 0) { // TODO: deal with sets containing strings. 4903 errln("c < 0"); 4904 break; 4905 } 4906 testText.append(c); 4907 } 4908 4909 // Calculate the expected results for this test string. 4910 mk.setText(testText); 4911 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4912 expectedBreaks[0] = 1; 4913 int32_t breakPos = 0; 4914 expectedCount = 0; 4915 for (;;) { 4916 breakPos = mk.next(breakPos); 4917 if (breakPos == -1) { 4918 break; 4919 } 4920 if (breakPos > testText.length()) { 4921 errln("breakPos > testText.length()"); 4922 } 4923 expectedBreaks[breakPos] = 1; 4924 U_ASSERT(expectedCount<testText.length()); 4925 expected[expectedCount ++] = breakPos; 4926 } 4927 4928 // Find the break positions using forward iteration 4929 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4930 if (useUText) { 4931 UErrorCode status = U_ZERO_ERROR; 4932 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4933 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4934 bi->setText(testUText, status); 4935 TEST_ASSERT_SUCCESS(status); 4936 utext_close(testUText); // The break iterator does a shallow clone of the UText 4937 // This UText can be closed immediately, so long as the 4938 // testText string continues to exist. 
4939 } else { 4940 bi->setText(testText); 4941 } 4942 4943 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4944 if (i < 0 || i > testText.length()) { 4945 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4946 break; 4947 } 4948 forwardBreaks[i] = 1; 4949 } 4950 4951 // Find the break positions using reverse iteration 4952 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4953 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4954 if (i < 0 || i > testText.length()) { 4955 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4956 break; 4957 } 4958 reverseBreaks[i] = 1; 4959 } 4960 4961 // Find the break positions using isBoundary() tests. 4962 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4963 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4964 for (i=0; i<=testText.length(); i++) { 4965 isBoundaryBreaks[i] = bi->isBoundary(i); 4966 } 4967 4968 4969 // Find the break positions using the following() function. 4970 // printf("."); 4971 memset(followingBreaks, 0, sizeof(followingBreaks)); 4972 int32_t lastBreakPos = 0; 4973 followingBreaks[0] = 1; 4974 for (i=0; i<testText.length(); i++) { 4975 breakPos = bi->following(i); 4976 if (breakPos <= i || 4977 breakPos < lastBreakPos || 4978 breakPos > testText.length() || 4979 (breakPos > lastBreakPos && lastBreakPos > i)) { 4980 errln("%s break monkey test: " 4981 "Out of range value returned by BreakIterator::following().\n" 4982 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4983 name, seed, i, breakPos, lastBreakPos); 4984 break; 4985 } 4986 followingBreaks[breakPos] = 1; 4987 lastBreakPos = breakPos; 4988 } 4989 4990 // Find the break positions using the preceding() function. 
4991 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4992 lastBreakPos = testText.length(); 4993 precedingBreaks[testText.length()] = 1; 4994 for (i=testText.length(); i>0; i--) { 4995 breakPos = bi->preceding(i); 4996 if (breakPos >= i || 4997 breakPos > lastBreakPos || 4998 (breakPos < 0 && testText.getChar32Start(i)>0) || 4999 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 5000 errln("%s break monkey test: " 5001 "Out of range value returned by BreakIterator::preceding().\n" 5002 "index=%d; prev returned %d; lastBreak=%d" , 5003 name, i, breakPos, lastBreakPos); 5004 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 5005 precedingBreaks[i] = 2; // Forces an error. 5006 } 5007 } else { 5008 if (breakPos >= 0) { 5009 precedingBreaks[breakPos] = 1; 5010 } 5011 lastBreakPos = breakPos; 5012 } 5013 } 5014 5015 // Compare the expected and actual results. 5016 for (i=0; i<=testText.length(); i++) { 5017 const char *errorType = NULL; 5018 if (forwardBreaks[i] != expectedBreaks[i]) { 5019 errorType = "next()"; 5020 } else if (reverseBreaks[i] != forwardBreaks[i]) { 5021 errorType = "previous()"; 5022 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 5023 errorType = "isBoundary()"; 5024 } else if (followingBreaks[i] != expectedBreaks[i]) { 5025 errorType = "following()"; 5026 } else if (precedingBreaks[i] != expectedBreaks[i]) { 5027 errorType = "preceding()"; 5028 } 5029 5030 5031 if (errorType != NULL) { 5032 // Format a range of the test text that includes the failure as 5033 // a data item that can be included in the rbbi test data file. 5034 5035 // Start of the range is the last point where expected and actual results 5036 // both agreed that there was a break position. 
5037 int startContext = i; 5038 int32_t count = 0; 5039 for (;;) { 5040 if (startContext==0) { break; } 5041 startContext --; 5042 if (expectedBreaks[startContext] != 0) { 5043 if (count == 2) break; 5044 count ++; 5045 } 5046 } 5047 5048 // End of range is two expected breaks past the start position. 5049 int endContext = i + 1; 5050 int ci; 5051 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 5052 for (;;) { 5053 if (endContext >= testText.length()) {break;} 5054 if (expectedBreaks[endContext-1] != 0) { 5055 if (count == 0) break; 5056 count --; 5057 } 5058 endContext ++; 5059 } 5060 } 5061 5062 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 5063 UnicodeString errorText = "<data>"; 5064 /***if (strcmp(errorType, "next()") == 0) { 5065 startContext = 0; 5066 endContext = testText.length(); 5067 5068 printStringBreaks(testText, expected, expectedCount); 5069 }***/ 5070 5071 for (ci=startContext; ci<endContext;) { 5072 UnicodeString hexChars("0123456789abcdef"); 5073 UChar32 c; 5074 int bn; 5075 c = testText.char32At(ci); 5076 if (ci == i) { 5077 // This is the location of the error. 5078 errorText.append("<?>"); 5079 } else if (expectedBreaks[ci] != 0) { 5080 // This a non-error expected break position. 5081 errorText.append("\\"); 5082 } 5083 if (c < 0x10000) { 5084 errorText.append("\\u"); 5085 for (bn=12; bn>=0; bn-=4) { 5086 errorText.append(hexChars.charAt((c>>bn)&0xf)); 5087 } 5088 } else { 5089 errorText.append("\\U"); 5090 for (bn=28; bn>=0; bn-=4) { 5091 errorText.append(hexChars.charAt((c>>bn)&0xf)); 5092 } 5093 } 5094 ci = testText.moveIndex32(ci, 1); 5095 } 5096 errorText.append("\\"); 5097 errorText.append("</data>\n"); 5098 5099 // Output the error 5100 char charErrorTxt[500]; 5101 UErrorCode status = U_ZERO_ERROR; 5102 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 5103 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 5104 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 5105 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 5106 errorType, seed, i, charErrorTxt); 5107 break; 5108 } 5109 } 5110 5111 loopCount++; 5112 } 5113 #endif 5114 } 5115 5116 5117 // Bug 5532. UTF-8 based UText fails in dictionary code. 5118 // This test checks the initial patch, 5119 // which is to just keep it from crashing. Correct word boundaries 5120 // await a proper fix to the dictionary code. 5121 // 5122 void RBBITest::TestBug5532(void) { 5123 // Text includes a mixture of Thai and Latin. 5124 const unsigned char utf8Data[] = { 5125 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 5126 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 5127 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 5128 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 5129 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 5130 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 5131 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 5132 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 5133 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 5134 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 5135 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 5136 5137 UErrorCode status = U_ZERO_ERROR; 5138 UText utext=UTEXT_INITIALIZER; 5139 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 5140 TEST_ASSERT_SUCCESS(status); 5141 5142 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 5143 TEST_ASSERT_SUCCESS(status); 5144 if (U_SUCCESS(status)) { 5145 bi->setText(&utext, status); 5146 TEST_ASSERT_SUCCESS(status); 5147 5148 int32_t breakCount = 0; 5149 int32_t previousBreak = -1; 5150 for (bi->first(); bi->next() != BreakIterator::DONE; 
breakCount++) { 5151 // For now, just make sure that the break iterator doesn't hang. 5152 TEST_ASSERT(previousBreak < bi->current()); 5153 previousBreak = bi->current(); 5154 } 5155 TEST_ASSERT(breakCount > 0); 5156 } 5157 delete bi; 5158 utext_close(&utext); 5159 } 5160 5161 5162 // 5163 // TestDebug - A place-holder test for debugging purposes. 5164 // For putting in fragments of other tests that can be invoked 5165 // for tracing without a lot of unwanted extra stuff happening. 5166 // 5167 void RBBITest::TestDebug(void) { 5168 #if 0 5169 UErrorCode status = U_ZERO_ERROR; 5170 int pos = 0; 5171 int ruleStatus = 0; 5172 5173 RuleBasedBreakIterator* bi = 5174 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 5175 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 5176 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 5177 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 5178 // UnicodeString s("Aaa. Bcd"); 5179 s = s.unescape(); 5180 bi->setText(s); 5181 UBool r = bi->isBoundary(8); 5182 printf("%s", r?"true":"false"); 5183 return; 5184 pos = bi->last(); 5185 do { 5186 // ruleStatus = bi->getRuleStatus(); 5187 printf("%d\t%d\n", pos, ruleStatus); 5188 pos = bi->previous(); 5189 } while (pos != BreakIterator::DONE); 5190 #endif 5191 } 5192 5193 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 5194