1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /************************************************************************ 7 * Date Name Description 8 * 12/15/99 Madhu Creation. 9 * 01/12/2000 Madhu Updated for changed API and added new tests 10 ************************************************************************/ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/utypes.h" 17 #include "unicode/brkiter.h" 18 #include "unicode/rbbi.h" 19 #include "unicode/uchar.h" 20 #include "unicode/utf16.h" 21 #include "unicode/ucnv.h" 22 #include "unicode/schriter.h" 23 #include "unicode/uniset.h" 24 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 25 #include "unicode/ustring.h" 26 #include "unicode/utext.h" 27 #include "intltest.h" 28 #include "rbbitst.h" 29 #include <string.h> 30 #include "uvector.h" 31 #include "uvectr32.h" 32 #include "triedict.h" 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include "unicode/numfmt.h" 37 #include "unicode/uscript.h" 38 39 #define TEST_ASSERT(x) {if (!(x)) { \ 40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 41 42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 44 45 46 //--------------------------------------------- 47 // runIndexedTest 48 //--------------------------------------------- 49 50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 51 { 52 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 53 54 switch (index) { 55 case 0: name = "TestBug4153072"; 56 if(exec) TestBug4153072(); break; 57 case 1: name = 
"TestJapaneseLineBreak"; 58 if(exec) TestJapaneseLineBreak(); break; 59 case 2: name = "TestStatusReturn"; 60 if(exec) TestStatusReturn(); break; 61 case 3: name = "TestUnicodeFiles"; 62 if(exec) TestUnicodeFiles(); break; 63 case 4: name = "TestEmptyString"; 64 if(exec) TestEmptyString(); break; 65 66 case 5: name = "TestGetAvailableLocales"; 67 if(exec) TestGetAvailableLocales(); break; 68 69 case 6: name = "TestGetDisplayName"; 70 if(exec) TestGetDisplayName(); break; 71 72 case 7: name = "TestEndBehaviour"; 73 if(exec) TestEndBehaviour(); break; 74 case 8: name = "TestMixedThaiLineBreak"; 75 if(exec) TestMixedThaiLineBreak(); break; 76 case 9: name = "TestThaiLineBreak"; 77 if(exec) TestThaiLineBreak(); break; 78 case 10: name = "TestMaiyamok"; 79 if(exec) TestMaiyamok(); break; 80 case 11: name = "TestWordBreaks"; 81 if(exec) TestWordBreaks(); break; 82 case 12: name = "TestWordBoundary"; 83 if(exec) TestWordBoundary(); break; 84 case 13: name = "TestLineBreaks"; 85 if(exec) TestLineBreaks(); break; 86 case 14: name = "TestSentBreaks"; 87 if(exec) TestSentBreaks(); break; 88 case 15: name = "TestExtended"; 89 if(exec) TestExtended(); break; 90 case 16: name = "TestMonkey"; 91 if(exec) { 92 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 93 TestMonkey(params); 94 #else 95 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)"); 96 #endif 97 } 98 break; 99 case 17: name = "TestBug3818"; 100 if(exec) TestBug3818(); break; 101 case 18: name = "TestJapaneseWordBreak"; 102 if(exec) TestJapaneseWordBreak(); break; 103 case 19: name = "TestDebug"; 104 if(exec) TestDebug(); break; 105 case 20: name = "TestTrieDict"; 106 if(exec) TestTrieDict(); break; 107 case 21: name = "TestBug5775"; 108 if (exec) TestBug5775(); break; 109 case 22: name = "TestThaiBreaks"; 110 if (exec) TestThaiBreaks(); break; 111 case 23: name = "TestTailoredBreaks"; 112 if (exec) TestTailoredBreaks(); break; 113 case 24: name = "TestTrieDictWithValue"; 114 if(exec) TestTrieDictWithValue(); break; 115 
116 default: name = ""; break; //needed to end loop 117 } 118 } 119 120 121 //--------------------------------------------------------------------------- 122 // 123 // class BITestData Holds a set of Break iterator test data and results 124 // Includes 125 // - the string data to be broken 126 // - a vector of the expected break positions. 127 // - a vector of source line numbers for the data, 128 // (to help see where errors occured.) 129 // - The expected break tag values. 130 // - Vectors of actual break positions and tag values. 131 // - Functions for comparing actual with expected and 132 // reporting errors. 133 // 134 //---------------------------------------------------------------------------- 135 class BITestData { 136 public: 137 UnicodeString fDataToBreak; 138 UVector fExpectedBreakPositions; 139 UVector fExpectedTags; 140 UVector fLineNum; 141 UVector fActualBreakPositions; // Test Results. 142 UVector fActualTags; 143 144 BITestData(UErrorCode &status); 145 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 146 void checkResults(const char *heading, RBBITest *test); 147 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 148 void clearResults(); 149 }; 150 151 // 152 // Constructor. 153 // 154 BITestData::BITestData(UErrorCode &status) 155 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 156 fActualTags(status) 157 { 158 } 159 160 // 161 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 162 // The macro form collects the line number, which is helpful 163 // when tracking down failures. 164 // 165 // A null data item is inserted at the start of each test's data 166 // to put the starting zero into the data list. The position saved for 167 // each non-null item is its ending position. 
168 // 169 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 170 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 171 if (U_FAILURE(status)) {return;} 172 if (data != NULL) { 173 fDataToBreak.append(CharsToUnicodeString(data)); 174 } 175 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 176 fExpectedTags.addElement(tag, status); 177 fLineNum.addElement(lineNum, status); 178 } 179 180 181 // 182 // checkResults. Compare the actual and expected break positions, report any differences. 183 // 184 void BITestData::checkResults(const char *heading, RBBITest *test) { 185 int32_t expectedIndex = 0; 186 int32_t actualIndex = 0; 187 188 for (;;) { 189 // If we've run through both the expected and actual results vectors, we're done. 190 // break out of the loop. 191 if (expectedIndex >= fExpectedBreakPositions.size() && 192 actualIndex >= fActualBreakPositions.size()) { 193 break; 194 } 195 196 197 if (expectedIndex >= fExpectedBreakPositions.size()) { 198 err(heading, test, expectedIndex-1, actualIndex); 199 actualIndex++; 200 continue; 201 } 202 203 if (actualIndex >= fActualBreakPositions.size()) { 204 err(heading, test, expectedIndex, actualIndex-1); 205 expectedIndex++; 206 continue; 207 } 208 209 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 210 err(heading, test, expectedIndex, actualIndex); 211 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 212 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 213 actualIndex++; 214 } else { 215 expectedIndex++; 216 } 217 continue; 218 } 219 220 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 221 test->errln("%s, tag mismatch. 
Test Line = %d, expected tag=%d, got %d", 222 heading, fLineNum.elementAt(expectedIndex), 223 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 224 } 225 226 actualIndex++; 227 expectedIndex++; 228 } 229 } 230 231 // 232 // err - An error was found. Report it, along with information about where the 233 // incorrectly broken test data appeared in the source file. 234 // 235 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 236 { 237 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 238 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 239 int32_t o = 0; 240 int32_t line = fLineNum.elementAti(expectedIdx); 241 if (expectedIdx > 0) { 242 // The line numbers are off by one because a premature break occurs somewhere 243 // within the previous item, rather than at the start of the current (expected) item. 244 // We want to report the offset of the unexpected break from the start of 245 // this previous item. 246 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 247 } 248 if (actual < expected) { 249 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 250 } else { 251 test->errln("%s Failed to find break at end of item from line %d. 
actual break: %d expected break: %d", heading, line, actual, expected); 252 } 253 } 254 255 256 void BITestData::clearResults() { 257 fActualBreakPositions.removeAllElements(); 258 fActualTags.removeAllElements(); 259 } 260 261 262 //----------------------------------------------------------------------------------- 263 // 264 // Cannned Test Characters 265 // 266 //----------------------------------------------------------------------------------- 267 268 static const UChar cannedTestArray[] = { 269 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 270 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 271 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 272 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 273 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 274 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 275 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 276 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 277 }; 278 279 static UnicodeString* cannedTestChars = 0; 280 281 #define halfNA "\\u0928\\u094d\\u200d" 282 #define halfSA "\\u0938\\u094d\\u200d" 283 #define halfCHA "\\u091a\\u094d\\u200d" 284 #define halfKA "\\u0915\\u094d\\u200d" 285 #define deadTA "\\u0924\\u094d" 286 287 //-------------------------------------------------------------------------------------- 288 // 289 // RBBITest constructor and destructor 290 // 291 //-------------------------------------------------------------------------------------- 292 293 RBBITest::RBBITest() { 294 UnicodeString 
temp(cannedTestArray); 295 cannedTestChars = new UnicodeString(); 296 *cannedTestChars += (UChar)0x0000; 297 *cannedTestChars += temp; 298 } 299 300 301 RBBITest::~RBBITest() { 302 delete cannedTestChars; 303 } 304 305 306 static const int T_NUMBER = 100; 307 static const int T_LETTER = 200; 308 static const int T_H_OR_K = 300; 309 static const int T_IDEO = 400; 310 311 312 313 314 315 316 //-------------------------------------------------------------------- 317 //Testing the BreakIterator for devanagari script 318 //-------------------------------------------------------------------- 319 320 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 321 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 322 #define deadTTHA "\\u0920\\u094d" 323 #define deadPA "\\u092a\\u094d" 324 #define deadSA "\\u0938\\u094d" 325 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 326 327 328 329 330 331 332 //----------------------------------------------------------------------------------- 333 // 334 // Test for status {tag} return value from break rules. 335 // TODO: a more thorough test. 
336 // 337 //----------------------------------------------------------------------------------- 338 void RBBITest::TestStatusReturn() { 339 UnicodeString rulesString1("$Letters = [:L:];\n" 340 "$Numbers = [:N:];\n" 341 "$Letters+{1};\n" 342 "$Numbers+{2};\n" 343 "Help\\ {4}/me\\!;\n" 344 "[^$Letters $Numbers];\n" 345 "!.*;\n", -1, US_INV); 346 UnicodeString testString1 = "abc123..abc Help me Help me!"; 347 // 01234567890123456789012345678 348 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 349 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 350 351 UErrorCode status=U_ZERO_ERROR; 352 UParseError parseError; 353 354 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 355 if(U_FAILURE(status)) { 356 dataerrln("FAIL : in construction - %s", u_errorName(status)); 357 } else { 358 int32_t pos; 359 int32_t i = 0; 360 bi->setText(testString1); 361 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 362 if (pos != bounds1[i]) { 363 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 364 break; 365 } 366 367 int tag = bi->getRuleStatus(); 368 if (tag != brkStatus[i]) { 369 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 370 break; 371 } 372 i++; 373 } 374 } 375 delete bi; 376 } 377 378 379 static void printStringBreaks(UnicodeString ustr, int expected[], 380 int expectedcount) 381 { 382 UErrorCode status = U_ZERO_ERROR; 383 char name[100]; 384 printf("code alpha extend alphanum type word sent line name\n"); 385 int j; 386 for (j = 0; j < ustr.length(); j ++) { 387 if (expectedcount > 0) { 388 int k; 389 for (k = 0; k < expectedcount; k ++) { 390 if (j == expected[k]) { 391 printf("------------------------------------------------ %d\n", 392 j); 393 } 394 } 395 } 396 UChar32 c = ustr.char32At(j); 397 if (c > 0xffff) { 398 j ++; 399 } 400 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 401 printf("%7x %5d %6d %8d %4s 
%4s %4s %4s %s\n", (int)c, 402 u_isUAlphabetic(c), 403 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 404 u_isalnum(c), 405 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 406 u_charType(c), 407 U_SHORT_PROPERTY_NAME), 408 u_getPropertyValueName(UCHAR_WORD_BREAK, 409 u_getIntPropertyValue(c, 410 UCHAR_WORD_BREAK), 411 U_SHORT_PROPERTY_NAME), 412 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 413 u_getIntPropertyValue(c, 414 UCHAR_SENTENCE_BREAK), 415 U_SHORT_PROPERTY_NAME), 416 u_getPropertyValueName(UCHAR_LINE_BREAK, 417 u_getIntPropertyValue(c, 418 UCHAR_LINE_BREAK), 419 U_SHORT_PROPERTY_NAME), 420 name); 421 } 422 } 423 424 void RBBITest::TestThaiLineBreak() { 425 UErrorCode status = U_ZERO_ERROR; 426 BITestData thaiLineSelection(status); 427 428 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 429 // represents elided letters at the end of a long word. It should be bound to 430 // the end of the word and not treated as an independent punctuation mark. 431 432 433 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 434 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 435 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 436 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 437 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 438 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 439 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 440 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 441 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 442 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 443 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 444 ADD_DATACHUNK(thaiLineSelection, 
"\\u0e40\\u0e23\\u0e48\\u0e07", 0, status); 445 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 446 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 447 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 448 449 // the one time where the paiyannoi occurs somewhere other than at the end 450 // of a word is in the Thai abbrevation for "etc.", which both begins and 451 // ends with a paiyannoi 452 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 453 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 454 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 455 456 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 457 Locale("th"), status); 458 if (U_FAILURE(status)) 459 { 460 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 461 return; 462 } 463 464 generalIteratorTest(*e, thaiLineSelection); 465 delete e; 466 } 467 468 469 470 void RBBITest::TestMixedThaiLineBreak() 471 { 472 UErrorCode status = U_ZERO_ERROR; 473 BITestData thaiLineSelection(status); 474 475 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 476 477 478 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 479 // start 480 481 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 482 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 483 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 484 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 485 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 486 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 487 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 488 
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status); 489 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 490 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 491 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 492 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 493 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 494 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 495 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 496 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 497 498 // @suwit - end of changes 499 500 501 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 502 if (U_FAILURE(status)) 503 { 504 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 505 return; 506 } 507 508 509 generalIteratorTest(*e, thaiLineSelection); 510 delete e; 511 } 512 513 514 void RBBITest::TestMaiyamok() 515 { 516 UErrorCode status = U_ZERO_ERROR; 517 BITestData thaiLineSelection(status); 518 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 519 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 520 // word". 
Instead of appearing as a word unto itself, however, it's kept together 521 // with the word before it 522 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 523 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 524 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 525 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 526 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 527 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 528 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 529 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 530 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 531 532 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 533 Locale("th"), status); 534 535 if (U_FAILURE(status)) 536 { 537 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 538 return; 539 } 540 generalIteratorTest(*e, thaiLineSelection); 541 delete e; 542 } 543 544 545 546 void RBBITest::TestBug3818() { 547 UErrorCode status = U_ZERO_ERROR; 548 549 // Four Thai words... 
550 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 551 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 552 UnicodeString thaiStr(thaiWordData); 553 554 RuleBasedBreakIterator* bi = 555 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 556 if (U_FAILURE(status) || bi == NULL) { 557 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 558 return; 559 } 560 bi->setText(thaiStr); 561 562 int32_t startOfSecondWord = bi->following(1); 563 if (startOfSecondWord != 4) { 564 errln("Fail at file %s, line %d expected start of word at 4, got %d", 565 __FILE__, __LINE__, startOfSecondWord); 566 } 567 startOfSecondWord = bi->following(0); 568 if (startOfSecondWord != 4) { 569 errln("Fail at file %s, line %d expected start of word at 4, got %d", 570 __FILE__, __LINE__, startOfSecondWord); 571 } 572 delete bi; 573 } 574 575 576 void RBBITest::TestJapaneseWordBreak() { 577 // TODO: Rewrite this test for a dictionary-based word breaking. 
578 #if 0 579 UErrorCode status = U_ZERO_ERROR; 580 BITestData japaneseWordSelection(status); 581 582 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 583 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 584 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 585 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 586 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 587 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 588 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 589 590 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 591 Locale("ja"), status); 592 if (U_FAILURE(status)) 593 { 594 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 595 return; 596 } 597 598 generalIteratorTest(*e, japaneseWordSelection); 599 delete e; 600 #endif 601 } 602 603 void RBBITest::TestTrieDict() { 604 UErrorCode status = U_ZERO_ERROR; 605 606 // 607 // Open and read the test data file. 608 // 609 const char *testDataDirectory = IntlTest::getSourceTestData(status); 610 char testFileName[1000]; 611 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 612 errln("Can't open test data. 
Path too long."); 613 return; 614 } 615 strcpy(testFileName, testDataDirectory); 616 strcat(testFileName, "riwords.txt"); 617 618 // Items needing deleting at the end 619 MutableTrieDictionary *mutableDict = NULL; 620 CompactTrieDictionary *compactDict = NULL; 621 UnicodeSet *breaks = NULL; 622 UChar *testFile = NULL; 623 StringEnumeration *enumer1 = NULL; 624 StringEnumeration *enumer2 = NULL; 625 MutableTrieDictionary *mutable2 = NULL; 626 StringEnumeration *cloneEnum = NULL; 627 CompactTrieDictionary *compact2 = NULL; 628 629 630 const UnicodeString *originalWord = NULL; 631 const UnicodeString *cloneWord = NULL; 632 UChar *current; 633 UChar *word; 634 UChar uc; 635 int32_t wordLen; 636 int32_t wordCount; 637 int32_t testCount; 638 639 int len; 640 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 641 if (U_FAILURE(status)) { 642 goto cleanup; /* something went wrong, error already output */ 643 } 644 645 mutableDict = new MutableTrieDictionary(0x0E1C, status); 646 if (U_FAILURE(status)) { 647 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 648 goto cleanup; 649 } 650 651 breaks = new UnicodeSet; 652 breaks->add(0x000A); // Line Feed 653 breaks->add(0x000D); // Carriage Return 654 breaks->add(0x2028); // Line Separator 655 breaks->add(0x2029); // Paragraph Separator 656 657 // Now add each non-comment line of the file as a word. 
658 current = testFile; 659 word = current; 660 uc = *current++; 661 wordLen = 0; 662 wordCount = 0; 663 664 while (uc) { 665 if (uc == 0x0023) { // #comment line, skip 666 while (uc && !breaks->contains(uc)) { 667 uc = *current++; 668 } 669 } 670 else while (uc && !breaks->contains(uc)) { 671 ++wordLen; 672 uc = *current++; 673 } 674 if (wordLen > 0) { 675 mutableDict->addWord(word, wordLen, status); 676 if (U_FAILURE(status)) { 677 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 678 goto cleanup; 679 } 680 wordCount += 1; 681 } 682 683 // Find beginning of next line 684 while (uc && breaks->contains(uc)) { 685 uc = *current++; 686 } 687 word = current-1; 688 wordLen = 0; 689 } 690 691 if (wordCount < 50) { 692 errln("Word count (%d) unreasonably small\n", wordCount); 693 goto cleanup; 694 } 695 696 enumer1 = mutableDict->openWords(status); 697 if (U_FAILURE(status)) { 698 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 699 goto cleanup; 700 } 701 702 testCount = 0; 703 if (wordCount != (testCount = enumer1->count(status))) { 704 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 705 testCount, wordCount, u_errorName(status)); 706 goto cleanup; 707 } 708 709 // Now compact it 710 compactDict = new CompactTrieDictionary(*mutableDict, status); 711 if (U_FAILURE(status)) { 712 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 713 goto cleanup; 714 } 715 716 enumer2 = compactDict->openWords(status); 717 if (U_FAILURE(status)) { 718 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 719 goto cleanup; 720 } 721 722 if (wordCount != (testCount = enumer2->count(status))) { 723 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 724 testCount, wordCount, u_errorName(status)); 725 goto cleanup; 726 } 727 728 if (enumer1->getDynamicClassID() == 
enumer2->getDynamicClassID()) { 729 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 730 } 731 delete enumer1; 732 enumer1 = NULL; 733 delete enumer2; 734 enumer2 = NULL; 735 736 // Now un-compact it 737 mutable2 = compactDict->cloneMutable(status); 738 if (U_FAILURE(status)) { 739 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 740 goto cleanup; 741 } 742 743 cloneEnum = mutable2->openWords(status); 744 if (U_FAILURE(status)) { 745 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 746 goto cleanup; 747 } 748 749 if (wordCount != (testCount = cloneEnum->count(status))) { 750 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 751 testCount, wordCount, u_errorName(status)); 752 goto cleanup; 753 } 754 755 // Compact original dictionary to clone. Note that we can only compare the same kind of 756 // dictionary as the order of the enumerators is not guaranteed to be the same between 757 // different kinds 758 enumer1 = mutableDict->openWords(status); 759 if (U_FAILURE(status)) { 760 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 761 goto cleanup; 762 } 763 764 originalWord = enumer1->snext(status); 765 cloneWord = cloneEnum->snext(status); 766 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 767 if (*originalWord != *cloneWord) { 768 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 769 goto cleanup; 770 } 771 originalWord = enumer1->snext(status); 772 cloneWord = cloneEnum->snext(status); 773 } 774 775 if (U_FAILURE(status)) { 776 errln("Enumeration failed: %s\n", u_errorName(status)); 777 goto cleanup; 778 } 779 780 if (originalWord != cloneWord) { 781 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 782 goto cleanup; 783 } 784 785 // Test the data copying constructor for 
CompactTrieDict, and the data access APIs. 786 compact2 = new CompactTrieDictionary(compactDict->data(), status); 787 if (U_FAILURE(status)) { 788 errln("CompactTrieDictionary(const void *,...) failed\n"); 789 goto cleanup; 790 } 791 792 if (compact2->dataSize() == 0) { 793 errln("CompactTrieDictionary->dataSize() == 0\n"); 794 goto cleanup; 795 } 796 797 // Now count the words via the second dictionary 798 delete enumer1; 799 enumer1 = compact2->openWords(status); 800 if (U_FAILURE(status)) { 801 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 802 goto cleanup; 803 } 804 805 if (wordCount != (testCount = enumer1->count(status))) { 806 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 807 testCount, wordCount, u_errorName(status)); 808 goto cleanup; 809 } 810 811 cleanup: 812 delete compactDict; 813 delete mutableDict; 814 delete breaks; 815 delete[] testFile; 816 delete enumer1; 817 delete mutable2; 818 delete cloneEnum; 819 delete compact2; 820 } 821 822 /*TODO: delete later*/ 823 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ 824 UErrorCode status = U_ZERO_ERROR; 825 FILE *outfile = fopen(filename,"w"); 826 UConverter *cvt = ucnv_open("UTF-8", &status); 827 if (U_FAILURE(status)) 828 return; 829 if(outfile != NULL){ 830 status = U_ZERO_ERROR; 831 const UnicodeString *word = enumer->snext(status); 832 while (word != NULL && U_SUCCESS(status)) { 833 char u8word[500]; 834 status = U_ZERO_ERROR; 835 ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(), 836 &status); 837 fprintf(outfile,"%s\n", u8word); 838 status = U_ZERO_ERROR; 839 word = enumer->snext(status); 840 } 841 fclose(outfile); 842 } 843 ucnv_close(cvt); 844 } 845 846 // A very simple helper class to streamline the buffer handling in 847 // TestTrieDictWithValue 848 template<class T, size_t N> 849 class AutoBuffer { 850 public: 851 AutoBuffer(size_t size) : 
buffer(stackBuffer) { 852 if (size > N) 853 buffer = new T[size]; 854 } 855 ~AutoBuffer() { 856 if (buffer != stackBuffer) 857 delete [] buffer; 858 } 859 T* elems() { 860 return buffer; 861 } 862 const T& operator[] (size_t i) const { 863 return buffer[i]; 864 } 865 T& operator[] (size_t i) { 866 return buffer[i]; 867 } 868 private: 869 T stackBuffer[N]; 870 T* buffer; 871 AutoBuffer(); 872 }; 873 874 //---------------------------------------------------------------------------- 875 // 876 // TestTrieDictWithValue Test trie dictionaries with logprob values and 877 // more than 2^16 nodes after compaction. 878 // 879 //---------------------------------------------------------------------------- 880 void RBBITest::TestTrieDictWithValue() { 881 UErrorCode status = U_ZERO_ERROR; 882 883 // 884 // Open and read the test data file. 885 // 886 const char *testDataDirectory = IntlTest::getSourceTestData(status); 887 const char *filename = "cjdict-truncated.txt"; 888 char testFileName[1000]; 889 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) { 890 errln("Can't open test data. 
Path too long."); 891 return; 892 } 893 strcpy(testFileName, testDataDirectory); 894 strcat(testFileName, filename); 895 896 // Items needing deleting at the end 897 MutableTrieDictionary *mutableDict = NULL; 898 CompactTrieDictionary *compactDict = NULL; 899 UnicodeSet *breaks = NULL; 900 UChar *testFile = NULL; 901 StringEnumeration *enumer1 = NULL; 902 StringEnumeration *enumer2 = NULL; 903 MutableTrieDictionary *mutable2 = NULL; 904 StringEnumeration *cloneEnum = NULL; 905 CompactTrieDictionary *compact2 = NULL; 906 NumberFormat *nf = NULL; 907 UText *originalText = NULL, *cloneText = NULL; 908 909 const UnicodeString *originalWord = NULL; 910 const UnicodeString *cloneWord = NULL; 911 UChar *current; 912 UChar *word; 913 UChar uc; 914 int32_t wordLen; 915 int32_t wordCount; 916 int32_t testCount; 917 int32_t valueLen; 918 int counter = 0; 919 920 int len; 921 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 922 if (U_FAILURE(status)) { 923 goto cleanup; /* something went wrong, error already output */ 924 } 925 926 mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); 927 if (U_FAILURE(status)) { 928 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 929 goto cleanup; 930 } 931 932 breaks = new UnicodeSet; 933 breaks->add(0x000A); // Line Feed 934 breaks->add(0x000D); // Carriage Return 935 breaks->add(0x2028); // Line Separator 936 breaks->add(0x2029); // Paragraph Separator 937 breaks->add(0x0009); // Tab character 938 939 // Now add each non-comment line of the file as a word. 
940 current = testFile; 941 word = current; 942 uc = *current++; 943 wordLen = 0; 944 wordCount = 0; 945 nf = NumberFormat::createInstance(status); 946 947 while (uc) { 948 UnicodeString ucharValue; 949 valueLen = 0; 950 951 if (uc == 0x0023) { // #comment line, skip 952 while (uc && !breaks->contains(uc)) { 953 uc = *current++; 954 } 955 } 956 else{ 957 while (uc && !breaks->contains(uc)) { 958 ++wordLen; 959 uc = *current++; 960 } 961 if(uc == 0x0009){ //separator is a tab char, read in num after tab 962 uc = *current++; 963 while (uc && !breaks->contains(uc)) { 964 ucharValue.append(uc); 965 uc = *current++; 966 } 967 } 968 } 969 if (wordLen > 0) { 970 Formattable value((int32_t)0); 971 nf->parse(ucharValue.getTerminatedBuffer(), value, status); 972 973 if(U_FAILURE(status)){ 974 errln("parsing of value failed when reading in dictionary\n"); 975 goto cleanup; 976 } 977 mutableDict->addWord(word, wordLen, status, value.getLong()); 978 if (U_FAILURE(status)) { 979 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 980 goto cleanup; 981 } 982 wordCount += 1; 983 } 984 985 // Find beginning of next line 986 while (uc && breaks->contains(uc)) { 987 uc = *current++; 988 } 989 word = current-1; 990 wordLen = 0; 991 } 992 993 if (wordCount < 50) { 994 errln("Word count (%d) unreasonably small\n", wordCount); 995 goto cleanup; 996 } 997 998 enumer1 = mutableDict->openWords(status); 999 if (U_FAILURE(status)) { 1000 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 1001 goto cleanup; 1002 } 1003 1004 testCount = 0; 1005 if (wordCount != (testCount = enumer1->count(status))) { 1006 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 1007 testCount, wordCount, u_errorName(status)); 1008 goto cleanup; 1009 } 1010 1011 // Now compact it 1012 compactDict = new CompactTrieDictionary(*mutableDict, status); 1013 if (U_FAILURE(status)) { 1014 errln("Failed to create 
CompactTrieDictionary: %s\n", u_errorName(status)); 1015 goto cleanup; 1016 } 1017 1018 enumer2 = compactDict->openWords(status); 1019 if (U_FAILURE(status)) { 1020 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 1021 goto cleanup; 1022 } 1023 1024 1025 //delete later 1026 // writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); 1027 // writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); 1028 1029 enumer1->reset(status); 1030 enumer2->reset(status); 1031 1032 originalWord = enumer1->snext(status); 1033 cloneWord = enumer2->snext(status); 1034 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 1035 if (*originalWord != *cloneWord) { 1036 errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", 1037 counter, originalWord->length(), cloneWord->length()); 1038 goto cleanup; 1039 } 1040 1041 // check if attached values of the same word in both dictionaries tally 1042 #if 0 1043 int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; 1044 uint16_t values1[originalWord->length()], values2[cloneWord->length()]; 1045 #endif 1046 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 1047 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 1048 AutoBuffer<uint16_t, 20> values1(originalWord->length()); 1049 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 1050 1051 originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 1052 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 1053 1054 int count1, count2; 1055 mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 1056 compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 1057 1058 if(values1[count1-1] != values2[count2-1]){ 1059 errln("Values of word %d in MutableTrieDictionary and 
CompactTrieDictionary do not match, with values %d and %d\n", 1060 counter, values1[count1-1], values2[count2-1]); 1061 goto cleanup; 1062 } 1063 1064 counter++; 1065 originalWord = enumer1->snext(status); 1066 cloneWord = enumer2->snext(status); 1067 } 1068 if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { 1069 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 1070 } 1071 1072 delete enumer1; 1073 enumer1 = NULL; 1074 delete enumer2; 1075 enumer2 = NULL; 1076 1077 // Now un-compact it 1078 mutable2 = compactDict->cloneMutable(status); 1079 if (U_FAILURE(status)) { 1080 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 1081 goto cleanup; 1082 } 1083 1084 cloneEnum = mutable2->openWords(status); 1085 if (U_FAILURE(status)) { 1086 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 1087 goto cleanup; 1088 } 1089 1090 if (wordCount != (testCount = cloneEnum->count(status))) { 1091 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 1092 testCount, wordCount, u_errorName(status)); 1093 goto cleanup; 1094 } 1095 1096 // Compact original dictionary to clone. 
Note that we can only compare the same kind of 1097 // dictionary as the order of the enumerators is not guaranteed to be the same between 1098 // different kinds 1099 enumer1 = mutableDict->openWords(status); 1100 if (U_FAILURE(status)) { 1101 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 1102 goto cleanup; 1103 } 1104 1105 counter = 0; 1106 originalWord = enumer1->snext(status); 1107 cloneWord = cloneEnum->snext(status); 1108 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 1109 if (*originalWord != *cloneWord) { 1110 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 1111 goto cleanup; 1112 } 1113 1114 // check if attached values of the same word in both dictionaries tally 1115 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 1116 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 1117 AutoBuffer<uint16_t, 20> values1(originalWord->length()); 1118 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 1119 originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 1120 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 1121 1122 int count1, count2; 1123 mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 1124 mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 1125 1126 if(values1[count1-1] != values2[count2-1]){ 1127 errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", 1128 counter, values1[count1-1], values2[count2-1]); 1129 goto cleanup; 1130 } 1131 1132 counter++; 1133 1134 originalWord = enumer1->snext(status); 1135 cloneWord = cloneEnum->snext(status); 1136 } 1137 1138 if (U_FAILURE(status)) { 1139 errln("Enumeration failed: %s\n", u_errorName(status)); 1140 goto cleanup; 1141 } 1142 1143 if (originalWord != cloneWord) { 1144 
errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 1145 goto cleanup; 1146 } 1147 1148 // Test the data copying constructor for CompactTrieDict, and the data access APIs. 1149 compact2 = new CompactTrieDictionary(compactDict->data(), status); 1150 if (U_FAILURE(status)) { 1151 errln("CompactTrieDictionary(const void *,...) failed\n"); 1152 goto cleanup; 1153 } 1154 1155 if (compact2->dataSize() == 0) { 1156 errln("CompactTrieDictionary->dataSize() == 0\n"); 1157 goto cleanup; 1158 } 1159 1160 // Now count the words via the second dictionary 1161 delete enumer1; 1162 enumer1 = compact2->openWords(status); 1163 if (U_FAILURE(status)) { 1164 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 1165 goto cleanup; 1166 } 1167 1168 if (wordCount != (testCount = enumer1->count(status))) { 1169 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 1170 testCount, wordCount, u_errorName(status)); 1171 goto cleanup; 1172 } 1173 1174 cleanup: 1175 delete compactDict; 1176 delete mutableDict; 1177 delete breaks; 1178 delete[] testFile; 1179 delete enumer1; 1180 delete mutable2; 1181 delete cloneEnum; 1182 delete compact2; 1183 utext_close(originalText); 1184 utext_close(cloneText); 1185 1186 1187 } 1188 1189 //---------------------------------------------------------------------------- 1190 // 1191 // generalIteratorTest Given a break iterator and a set of test data, 1192 // Run the tests and report the results. 
1193 // 1194 //---------------------------------------------------------------------------- 1195 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 1196 { 1197 1198 bi.setText(td.fDataToBreak); 1199 1200 testFirstAndNext(bi, td); 1201 1202 testLastAndPrevious(bi, td); 1203 1204 testFollowing(bi, td); 1205 testPreceding(bi, td); 1206 testIsBoundary(bi, td); 1207 doMultipleSelectionTest(bi, td); 1208 } 1209 1210 1211 // 1212 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 1213 // kind of loop. 1214 // 1215 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 1216 { 1217 UErrorCode status = U_ZERO_ERROR; 1218 int32_t p; 1219 int32_t lastP = -1; 1220 int32_t tag; 1221 1222 logln("Test first and next"); 1223 bi.setText(td.fDataToBreak); 1224 td.clearResults(); 1225 1226 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 1227 td.fActualBreakPositions.addElement(p, status); // Save result. 1228 tag = bi.getRuleStatus(); 1229 td.fActualTags.addElement(tag, status); 1230 if (p <= lastP) { 1231 // If the iterator is not making forward progress, stop. 1232 // No need to raise an error here, it'll be detected in the normal check of results. 1233 break; 1234 } 1235 lastP = p; 1236 } 1237 td.checkResults("testFirstAndNext", this); 1238 } 1239 1240 1241 // 1242 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 1243 // 1244 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 1245 { 1246 UErrorCode status = U_ZERO_ERROR; 1247 int32_t p; 1248 int32_t lastP = 0x7ffffffe; 1249 int32_t tag; 1250 1251 logln("Test last and previous"); 1252 bi.setText(td.fDataToBreak); 1253 td.clearResults(); 1254 1255 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 1256 // Save break position. Insert it at start of vector of results, shoving 1257 // already-saved results further towards the end. 
1258 td.fActualBreakPositions.insertElementAt(p, 0, status); 1259 // bi.previous(); // TODO: Why does this fix things up???? 1260 // bi.next(); 1261 tag = bi.getRuleStatus(); 1262 td.fActualTags.insertElementAt(tag, 0, status); 1263 if (p >= lastP) { 1264 // If the iterator is not making progress, stop. 1265 // No need to raise an error here, it'll be detected in the normal check of results. 1266 break; 1267 } 1268 lastP = p; 1269 } 1270 td.checkResults("testLastAndPrevious", this); 1271 } 1272 1273 1274 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 1275 { 1276 UErrorCode status = U_ZERO_ERROR; 1277 int32_t p; 1278 int32_t tag; 1279 int32_t lastP = -2; // A value that will never be returned as a break position. 1280 // cannot be -1; that is returned for DONE. 1281 int i; 1282 1283 logln("testFollowing():"); 1284 bi.setText(td.fDataToBreak); 1285 td.clearResults(); 1286 1287 // Save the starting point, since we won't get that out of following. 1288 p = bi.first(); 1289 td.fActualBreakPositions.addElement(p, status); // Save result. 1290 tag = bi.getRuleStatus(); 1291 td.fActualTags.addElement(tag, status); 1292 1293 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 1294 p = bi.following(i); 1295 if (p != lastP) { 1296 if (p == RuleBasedBreakIterator::DONE) { 1297 break; 1298 } 1299 // We've reached a new break position. Save it. 1300 td.fActualBreakPositions.addElement(p, status); // Save result. 1301 tag = bi.getRuleStatus(); 1302 td.fActualTags.addElement(tag, status); 1303 lastP = p; 1304 } 1305 } 1306 // The loop normally exits by means of the break in the middle. 1307 // Make sure that the index was at the correct position for the break iterator to have 1308 // returned DONE. 1309 if (i != td.fDataToBreak.length()) { 1310 errln("testFollowing(): iterator returned DONE prematurely."); 1311 } 1312 1313 // Full check of all results. 
1314 td.checkResults("testFollowing", this); 1315 } 1316 1317 1318 1319 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 1320 UErrorCode status = U_ZERO_ERROR; 1321 int32_t p; 1322 int32_t tag; 1323 int32_t lastP = 0x7ffffffe; 1324 int i; 1325 1326 logln("testPreceding():"); 1327 bi.setText(td.fDataToBreak); 1328 td.clearResults(); 1329 1330 p = bi.last(); 1331 td.fActualBreakPositions.addElement(p, status); 1332 tag = bi.getRuleStatus(); 1333 td.fActualTags.addElement(tag, status); 1334 1335 for (i = td.fDataToBreak.length(); i>=-1; i--) { 1336 p = bi.preceding(i); 1337 if (p != lastP) { 1338 if (p == RuleBasedBreakIterator::DONE) { 1339 break; 1340 } 1341 // We've reached a new break position. Save it. 1342 td.fActualBreakPositions.insertElementAt(p, 0, status); 1343 lastP = p; 1344 tag = bi.getRuleStatus(); 1345 td.fActualTags.insertElementAt(tag, 0, status); 1346 } 1347 } 1348 // The loop normally exits by means of the break in the middle. 1349 // Make sure that the index was at the correct position for the break iterator to have 1350 // returned DONE. 1351 if (i != 0) { 1352 errln("testPreceding(): iterator returned DONE prematurely."); 1353 } 1354 1355 // Full check of all results. 1356 td.checkResults("testPreceding", this); 1357 } 1358 1359 1360 1361 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 1362 UErrorCode status = U_ZERO_ERROR; 1363 int i; 1364 int32_t tag; 1365 1366 logln("testIsBoundary():"); 1367 bi.setText(td.fDataToBreak); 1368 td.clearResults(); 1369 1370 for (i = 0; i <= td.fDataToBreak.length(); i++) { 1371 if (bi.isBoundary(i)) { 1372 td.fActualBreakPositions.addElement(i, status); // Save result. 
1373 tag = bi.getRuleStatus(); 1374 td.fActualTags.addElement(tag, status); 1375 } 1376 } 1377 td.checkResults("testIsBoundary: ", this); 1378 } 1379 1380 1381 1382 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1383 { 1384 iterator.setText(td.fDataToBreak); 1385 1386 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1387 int32_t offset = iterator.first(); 1388 int32_t testOffset; 1389 int32_t count = 0; 1390 1391 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1392 1393 if (*testIterator != iterator) 1394 errln("clone() or operator!= failed: two clones compared unequal"); 1395 1396 do { 1397 testOffset = testIterator->first(); 1398 testOffset = testIterator->next(count); 1399 if (offset != testOffset) 1400 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1401 1402 if (offset != RuleBasedBreakIterator::DONE) { 1403 count++; 1404 offset = iterator.next(); 1405 1406 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1407 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1408 if (count > 10000 || offset == -1) { 1409 errln("operator== failed too many times. Stopping test."); 1410 if (offset == -1) { 1411 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1412 } 1413 return; 1414 } 1415 } 1416 } 1417 } while (offset != RuleBasedBreakIterator::DONE); 1418 1419 // now do it backwards... 
1420 offset = iterator.last(); 1421 count = 0; 1422 1423 do { 1424 testOffset = testIterator->last(); 1425 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1426 if (offset != testOffset) 1427 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1428 1429 if (offset != RuleBasedBreakIterator::DONE) { 1430 count--; 1431 offset = iterator.previous(); 1432 } 1433 } while (offset != RuleBasedBreakIterator::DONE); 1434 1435 delete testIterator; 1436 } 1437 1438 1439 //--------------------------------------------- 1440 // 1441 // other tests 1442 // 1443 //--------------------------------------------- 1444 void RBBITest::TestEmptyString() 1445 { 1446 UnicodeString text = ""; 1447 UErrorCode status = U_ZERO_ERROR; 1448 1449 BITestData x(status); 1450 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1451 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1452 if (U_FAILURE(status)) 1453 { 1454 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1455 return; 1456 } 1457 generalIteratorTest(*bi, x); 1458 delete bi; 1459 } 1460 1461 void RBBITest::TestGetAvailableLocales() 1462 { 1463 int32_t locCount = 0; 1464 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1465 1466 if (locCount == 0) 1467 dataerrln("getAvailableLocales() returned an empty list!"); 1468 // Just make sure that it's returning good memory. 
1469 int32_t i; 1470 for (i = 0; i < locCount; ++i) { 1471 logln(locList[i].getName()); 1472 } 1473 } 1474 1475 //Testing the BreakIterator::getDisplayName() function 1476 void RBBITest::TestGetDisplayName() 1477 { 1478 UnicodeString result; 1479 1480 BreakIterator::getDisplayName(Locale::getUS(), result); 1481 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1482 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1483 + result); 1484 1485 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1486 if (result != "French (France)") 1487 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1488 + result); 1489 } 1490 /** 1491 * Test End Behaviour 1492 * @bug 4068137 1493 */ 1494 void RBBITest::TestEndBehaviour() 1495 { 1496 UErrorCode status = U_ZERO_ERROR; 1497 UnicodeString testString("boo."); 1498 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1499 if (U_FAILURE(status)) 1500 { 1501 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1502 return; 1503 } 1504 wb->setText(testString); 1505 1506 if (wb->first() != 0) 1507 errln("Didn't get break at beginning of string."); 1508 if (wb->next() != 3) 1509 errln("Didn't get break before period in \"boo.\""); 1510 if (wb->current() != 4 && wb->next() != 4) 1511 errln("Didn't get break at end of string."); 1512 delete wb; 1513 } 1514 /* 1515 * @bug 4153072 1516 */ 1517 void RBBITest::TestBug4153072() { 1518 UErrorCode status = U_ZERO_ERROR; 1519 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1520 if (U_FAILURE(status)) 1521 { 1522 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1523 return; 1524 } 1525 UnicodeString str("...Hello, World!..."); 1526 int32_t begin = 3; 1527 int32_t end = str.length() - 3; 1528 UBool onBoundary; 1529 1530 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1531 iter->adoptText(textIterator); 1532 int index; 1533 // Note: with the switch to UText, there is no way to restrict the 1534 // iteration range to begin at an index other than zero. 1535 // String character iterators created with a non-zero bound are 1536 // treated by RBBI as being empty. 1537 for (index = -1; index < begin + 1; ++index) { 1538 onBoundary = iter->isBoundary(index); 1539 if (index == 0? !onBoundary : onBoundary) { 1540 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1541 " and begin index = " + begin); 1542 } 1543 } 1544 delete iter; 1545 } 1546 1547 1548 // 1549 // Test for problem reported by Ashok Matoria on 9 July 2007 1550 // One.<kSoftHyphen><kSpace>Two. 1551 // 1552 // Sentence break at start (0) and then on calling next() it breaks at 1553 // 'T' of "Two". Now, at this point if I do next() and 1554 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1555 // 1556 void RBBITest::TestBug5775() { 1557 UErrorCode status = U_ZERO_ERROR; 1558 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1559 TEST_ASSERT_SUCCESS(status); 1560 if (U_FAILURE(status)) { 1561 return; 1562 } 1563 // Check for status first for better handling of no data errors. 1564 TEST_ASSERT(bi != NULL); 1565 if (bi == NULL) { 1566 return; 1567 } 1568 1569 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1570 // 01234 56789 1571 s = s.unescape(); 1572 bi->setText(s); 1573 int pos = bi->next(); 1574 TEST_ASSERT(pos == 6); 1575 pos = bi->next(); 1576 TEST_ASSERT(pos == 10); 1577 pos = bi->previous(); 1578 TEST_ASSERT(pos == 6); 1579 delete bi; 1580 } 1581 1582 1583 1584 /** 1585 * Test Japanese Line Break 1586 * @bug 4095322 1587 */ 1588 void RBBITest::TestJapaneseLineBreak() 1589 { 1590 #if 0 1591 // Test needs updating some more... Dump it for now. 1592 1593 1594 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1595 // as opening and closing punctuation for line breaking. 1596 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1597 // from these tests. 
6-13-2002 1598 // 1599 UErrorCode status = U_ZERO_ERROR; 1600 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1601 UnicodeString precedingChars = CharsToUnicodeString( 1602 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1603 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1604 UnicodeString followingChars = CharsToUnicodeString( 1605 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1606 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1607 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1608 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1609 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1610 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1611 1612 int32_t i; 1613 if (U_FAILURE(status)) 1614 { 1615 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1616 return; 1617 } 1618 1619 for (i = 0; i < precedingChars.length(); i++) { 1620 testString.setCharAt(1, precedingChars[i]); 1621 iter->setText(testString); 1622 int32_t j = iter->first(); 1623 if (j != 0) 1624 errln("ja line break failure: failed to start at 0"); 1625 j = iter->next(); 1626 if (j != 1) 1627 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1628 + "' (" + ((int)(precedingChars[i])) + ")"); 1629 j = iter->next(); 1630 if (j != 3) 1631 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1632 + "' (" + ((int)(precedingChars[i])) + ")"); 1633 } 1634 1635 for (i = 0; i < followingChars.length(); i++) { 1636 testString.setCharAt(1, followingChars[i]); 1637 iter->setText(testString); 1638 int j = iter->first(); 1639 if (j != 0) 1640 errln("ja line break failure: failed to start at 0"); 1641 j = iter->next(); 1642 
if (j != 2) 1643 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1644 + "' (" + ((int)(followingChars[i])) + ")"); 1645 j = iter->next(); 1646 if (j != 3) 1647 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1648 + "' (" + ((int)(followingChars[i])) + ")"); 1649 } 1650 delete iter; 1651 #endif 1652 } 1653 1654 1655 //------------------------------------------------------------------------------ 1656 // 1657 // RBBITest::Extended Run RBBI Tests from an external test data file 1658 // 1659 //------------------------------------------------------------------------------ 1660 1661 struct TestParams { 1662 BreakIterator *bi; 1663 UnicodeString dataToBreak; 1664 UVector32 *expectedBreaks; 1665 UVector32 *srcLine; 1666 UVector32 *srcCol; 1667 }; 1668 1669 void RBBITest::executeTest(TestParams *t) { 1670 int32_t bp; 1671 int32_t prevBP; 1672 int32_t i; 1673 1674 if (t->bi == NULL) { 1675 return; 1676 } 1677 1678 t->bi->setText(t->dataToBreak); 1679 // 1680 // Run the iterator forward 1681 // 1682 prevBP = -1; 1683 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1684 if (prevBP == bp) { 1685 // Fail for lack of forward progress. 1686 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1687 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1688 break; 1689 } 1690 1691 // Check that there were we didn't miss an expected break between the last one 1692 // and this one. 1693 for (i=prevBP+1; i<bp; i++) { 1694 if (t->expectedBreaks->elementAti(i) != 0) { 1695 int expected[] = {0, i}; 1696 printStringBreaks(t->dataToBreak, expected, 2); 1697 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1698 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1699 } 1700 } 1701 1702 // Check that the break we did find was expected 1703 if (t->expectedBreaks->elementAti(bp) == 0) { 1704 int expected[] = {0, bp}; 1705 printStringBreaks(t->dataToBreak, expected, 2); 1706 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1707 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1708 } else { 1709 // The break was expected. 1710 // Check that the {nnn} tag value is correct. 1711 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1712 if (expectedTagVal == -1) { 1713 expectedTagVal = 0; 1714 } 1715 int32_t line = t->srcLine->elementAti(bp); 1716 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1717 if (rs != expectedTagVal) { 1718 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1719 " Actual, Expected status = %4d, %4d", 1720 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1721 } 1722 } 1723 1724 1725 prevBP = bp; 1726 } 1727 1728 // Verify that there were no missed expected breaks after the last one found 1729 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1730 if (t->expectedBreaks->elementAti(i) != 0) { 1731 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1732 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1733 } 1734 } 1735 1736 // 1737 // Run the iterator backwards, verify that the same breaks are found. 1738 // 1739 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1740 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1741 if (prevBP == bp) { 1742 // Fail for lack of progress. 1743 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1744 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1745 break; 1746 } 1747 1748 // Check that there were we didn't miss an expected break between the last one 1749 // and this one. (UVector returns zeros for index out of bounds.) 1750 for (i=prevBP-1; i>bp; i--) { 1751 if (t->expectedBreaks->elementAti(i) != 0) { 1752 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1753 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1754 } 1755 } 1756 1757 // Check that the break we did find was expected 1758 if (t->expectedBreaks->elementAti(bp) == 0) { 1759 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1760 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1761 } else { 1762 // The break was expected. 1763 // Check that the {nnn} tag value is correct. 1764 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1765 if (expectedTagVal == -1) { 1766 expectedTagVal = 0; 1767 } 1768 int line = t->srcLine->elementAti(bp); 1769 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1770 if (rs != expectedTagVal) { 1771 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1772 " Actual, Expected status = %4d, %4d", 1773 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1774 } 1775 } 1776 1777 prevBP = bp; 1778 } 1779 1780 // Verify that there were no missed breaks prior to the last one found 1781 for (i=prevBP-1; i>=0; i--) { 1782 if (t->expectedBreaks->elementAti(i) != 0) { 1783 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1784 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1785 } 1786 } 1787 } 1788 1789 1790 void RBBITest::TestExtended() { 1791 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1792 UErrorCode status = U_ZERO_ERROR; 1793 Locale locale(""); 1794 1795 UnicodeString rules; 1796 TestParams tp; 1797 tp.bi = NULL; 1798 tp.expectedBreaks = new UVector32(status); 1799 tp.srcLine = new UVector32(status); 1800 tp.srcCol = new UVector32(status); 1801 1802 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1803 if (U_FAILURE(status)) { 1804 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1805 } 1806 1807 1808 // 1809 // Open and read the test data file. 1810 // 1811 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1812 char testFileName[1000]; 1813 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1814 errln("Can't open test data. 
Path too long."); 1815 return; 1816 } 1817 strcpy(testFileName, testDataDirectory); 1818 strcat(testFileName, "rbbitst.txt"); 1819 1820 int len; 1821 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1822 if (U_FAILURE(status)) { 1823 return; /* something went wrong, error already output */ 1824 } 1825 1826 1827 1828 1829 // 1830 // Put the test data into a UnicodeString 1831 // 1832 UnicodeString testString(FALSE, testFile, len); 1833 1834 enum EParseState{ 1835 PARSE_COMMENT, 1836 PARSE_TAG, 1837 PARSE_DATA, 1838 PARSE_NUM 1839 } 1840 parseState = PARSE_TAG; 1841 1842 EParseState savedState = PARSE_TAG; 1843 1844 static const UChar CH_LF = 0x0a; 1845 static const UChar CH_CR = 0x0d; 1846 static const UChar CH_HASH = 0x23; 1847 /*static const UChar CH_PERIOD = 0x2e;*/ 1848 static const UChar CH_LT = 0x3c; 1849 static const UChar CH_GT = 0x3e; 1850 static const UChar CH_BACKSLASH = 0x5c; 1851 static const UChar CH_BULLET = 0x2022; 1852 1853 int32_t lineNum = 1; 1854 int32_t colStart = 0; 1855 int32_t column = 0; 1856 int32_t charIdx = 0; 1857 1858 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1859 1860 for (charIdx = 0; charIdx < len; ) { 1861 status = U_ZERO_ERROR; 1862 UChar c = testString.charAt(charIdx); 1863 charIdx++; 1864 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1865 // treat CRLF as a unit 1866 c = CH_LF; 1867 charIdx++; 1868 } 1869 if (c == CH_LF || c == CH_CR) { 1870 lineNum++; 1871 colStart = charIdx; 1872 } 1873 column = charIdx - colStart + 1; 1874 1875 switch (parseState) { 1876 case PARSE_COMMENT: 1877 if (c == 0x0a || c == 0x0d) { 1878 parseState = savedState; 1879 } 1880 break; 1881 1882 case PARSE_TAG: 1883 { 1884 if (c == CH_HASH) { 1885 parseState = PARSE_COMMENT; 1886 savedState = PARSE_TAG; 1887 break; 1888 } 1889 if (u_isUWhiteSpace(c)) { 1890 break; 1891 } 1892 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1893 delete tp.bi; 1894 tp.bi = BreakIterator::createWordInstance(locale, status); 1895 charIdx += 5; 1896 break; 1897 } 1898 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1899 delete tp.bi; 1900 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1901 charIdx += 5; 1902 break; 1903 } 1904 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1905 delete tp.bi; 1906 tp.bi = BreakIterator::createLineInstance(locale, status); 1907 charIdx += 5; 1908 break; 1909 } 1910 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1911 delete tp.bi; 1912 tp.bi = NULL; 1913 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1914 charIdx += 5; 1915 break; 1916 } 1917 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1918 delete tp.bi; 1919 tp.bi = BreakIterator::createTitleInstance(locale, status); 1920 charIdx += 6; 1921 break; 1922 } 1923 1924 // <locale loc_name> 1925 localeMatcher.reset(testString); 1926 if (localeMatcher.lookingAt(charIdx-1, status)) { 1927 UnicodeString localeName = localeMatcher.group(1, status); 1928 char localeName8[100]; 1929 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1930 locale = 
Locale::createFromName(localeName8); 1931 charIdx += localeMatcher.group(0, status).length(); 1932 TEST_ASSERT_SUCCESS(status); 1933 break; 1934 } 1935 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1936 parseState = PARSE_DATA; 1937 charIdx += 5; 1938 tp.dataToBreak = ""; 1939 tp.expectedBreaks->removeAllElements(); 1940 tp.srcCol ->removeAllElements(); 1941 tp.srcLine->removeAllElements(); 1942 break; 1943 } 1944 1945 errln("line %d: Tag expected in test file.", lineNum); 1946 parseState = PARSE_COMMENT; 1947 savedState = PARSE_DATA; 1948 goto end_test; // Stop the test. 1949 } 1950 break; 1951 1952 case PARSE_DATA: 1953 if (c == CH_BULLET) { 1954 int32_t breakIdx = tp.dataToBreak.length(); 1955 tp.expectedBreaks->setSize(breakIdx+1); 1956 tp.expectedBreaks->setElementAt(-1, breakIdx); 1957 tp.srcLine->setSize(breakIdx+1); 1958 tp.srcLine->setElementAt(lineNum, breakIdx); 1959 tp.srcCol ->setSize(breakIdx+1); 1960 tp.srcCol ->setElementAt(column, breakIdx); 1961 break; 1962 } 1963 1964 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1965 // Add final entry to mappings from break location to source file position. 1966 // Need one extra because last break position returned is after the 1967 // last char in the data, not at the last char. 1968 tp.srcLine->addElement(lineNum, status); 1969 tp.srcCol ->addElement(column, status); 1970 1971 parseState = PARSE_TAG; 1972 charIdx += 6; 1973 1974 // RUN THE TEST! 1975 executeTest(&tp); 1976 break; 1977 } 1978 1979 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1980 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1981 // Get the code point from the name and insert it into the test data. 1982 // (Damn, no API takes names in Unicode !!! 
1983 // we've got to take it back to char *) 1984 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1985 int32_t nameLength = nameEndIdx - (charIdx+2); 1986 char charNameBuf[200]; 1987 UChar32 theChar = -1; 1988 if (nameEndIdx != -1) { 1989 UErrorCode status = U_ZERO_ERROR; 1990 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1991 charNameBuf[sizeof(charNameBuf)-1] = 0; 1992 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1993 if (U_FAILURE(status)) { 1994 theChar = -1; 1995 } 1996 } 1997 if (theChar == -1) { 1998 errln("Error in named character in test file at line %d, col %d", 1999 lineNum, column); 2000 } else { 2001 // Named code point was recognized. Insert it 2002 // into the test data. 2003 tp.dataToBreak.append(theChar); 2004 while (tp.dataToBreak.length() > tp.srcLine->size()) { 2005 tp.srcLine->addElement(lineNum, status); 2006 tp.srcCol ->addElement(column, status); 2007 } 2008 } 2009 if (nameEndIdx > charIdx) { 2010 charIdx = nameEndIdx+1; 2011 2012 } 2013 break; 2014 } 2015 2016 2017 2018 2019 if (testString.compare(charIdx-1, 2, "<>") == 0) { 2020 charIdx++; 2021 int32_t breakIdx = tp.dataToBreak.length(); 2022 tp.expectedBreaks->setSize(breakIdx+1); 2023 tp.expectedBreaks->setElementAt(-1, breakIdx); 2024 tp.srcLine->setSize(breakIdx+1); 2025 tp.srcLine->setElementAt(lineNum, breakIdx); 2026 tp.srcCol ->setSize(breakIdx+1); 2027 tp.srcCol ->setElementAt(column, breakIdx); 2028 break; 2029 } 2030 2031 if (c == CH_LT) { 2032 tagValue = 0; 2033 parseState = PARSE_NUM; 2034 break; 2035 } 2036 2037 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 2038 parseState = PARSE_COMMENT; 2039 savedState = PARSE_DATA; 2040 break; 2041 } 2042 2043 if (c == CH_BACKSLASH) { 2044 // Check for \ at end of line, a line continuation. 
2045 // Advance over (discard) the newline 2046 UChar32 cp = testString.char32At(charIdx); 2047 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 2048 // We have a CR LF 2049 // Need an extra increment of the input ptr to move over both of them 2050 charIdx++; 2051 } 2052 if (cp == CH_LF || cp == CH_CR) { 2053 lineNum++; 2054 colStart = charIdx; 2055 charIdx++; 2056 break; 2057 } 2058 2059 // Let unescape handle the back slash. 2060 cp = testString.unescapeAt(charIdx); 2061 if (cp != -1) { 2062 // Escape sequence was recognized. Insert the char 2063 // into the test data. 2064 tp.dataToBreak.append(cp); 2065 while (tp.dataToBreak.length() > tp.srcLine->size()) { 2066 tp.srcLine->addElement(lineNum, status); 2067 tp.srcCol ->addElement(column, status); 2068 } 2069 break; 2070 } 2071 2072 2073 // Not a recognized backslash escape sequence. 2074 // Take the next char as a literal. 2075 // TODO: Should this be an error? 2076 c = testString.charAt(charIdx); 2077 charIdx = testString.moveIndex32(charIdx, 1); 2078 } 2079 2080 // Normal, non-escaped data char. 2081 tp.dataToBreak.append(c); 2082 2083 // Save the mapping from offset in the data to line/column numbers in 2084 // the original input file. Will be used for better error messages only. 2085 // If there's an expected break before this char, the slot in the mapping 2086 // vector will already be set for this char; don't overwrite it. 2087 if (tp.dataToBreak.length() > tp.srcLine->size()) { 2088 tp.srcLine->addElement(lineNum, status); 2089 tp.srcCol ->addElement(column, status); 2090 } 2091 break; 2092 2093 2094 case PARSE_NUM: 2095 // We are parsing an expected numeric tag value, like <1234>, 2096 // within a chunk of data. 2097 if (u_isUWhiteSpace(c)) { 2098 break; 2099 } 2100 2101 if (c == CH_GT) { 2102 // Finished the number. Add the info to the expected break data, 2103 // and switch parse state back to doing plain data. 
2104 parseState = PARSE_DATA; 2105 if (tagValue == 0) { 2106 tagValue = -1; 2107 } 2108 int32_t breakIdx = tp.dataToBreak.length(); 2109 tp.expectedBreaks->setSize(breakIdx+1); 2110 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 2111 tp.srcLine->setSize(breakIdx+1); 2112 tp.srcLine->setElementAt(lineNum, breakIdx); 2113 tp.srcCol ->setSize(breakIdx+1); 2114 tp.srcCol ->setElementAt(column, breakIdx); 2115 break; 2116 } 2117 2118 if (u_isdigit(c)) { 2119 tagValue = tagValue*10 + u_charDigitValue(c); 2120 break; 2121 } 2122 2123 errln("Syntax Error in test file at line %d, col %d", 2124 lineNum, column); 2125 parseState = PARSE_COMMENT; 2126 goto end_test; // Stop the test 2127 break; 2128 } 2129 2130 2131 if (U_FAILURE(status)) { 2132 errln("ICU Error %s while parsing test file at line %d.", 2133 u_errorName(status), lineNum); 2134 status = U_ZERO_ERROR; 2135 goto end_test; // Stop the test 2136 } 2137 2138 } 2139 2140 end_test: 2141 delete tp.bi; 2142 delete tp.expectedBreaks; 2143 delete tp.srcLine; 2144 delete tp.srcCol; 2145 delete [] testFile; 2146 #endif 2147 } 2148 2149 void RBBITest::TestThaiBreaks() { 2150 UErrorCode status=U_ZERO_ERROR; 2151 BreakIterator* b; 2152 Locale locale = Locale("th"); 2153 int32_t p, index; 2154 UChar c[]= { 2155 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 2156 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 2157 0x0E16, 0x0E49, 0x0E33 2158 }; 2159 int32_t expectedWordResult[] = { 2160 2, 3, 6, 10, 11, 15, 17, 20, 22 2161 }; 2162 int32_t expectedLineResult[] = { 2163 3, 6, 11, 15, 17, 20, 22 2164 }; 2165 int32_t size = sizeof(c)/sizeof(UChar); 2166 UnicodeString text=UnicodeString(c); 2167 2168 b = BreakIterator::createWordInstance(locale, status); 2169 if (U_FAILURE(status)) { 2170 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 2171 return; 2172 } 2173 b->setText(text); 2174 p = index = 0; 2175 while ((p=b->next())!=BreakIterator::DONE && p < size) { 2176 if (p != expectedWordResult[index++]) { 2177 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 2178 } 2179 } 2180 delete b; 2181 2182 b = BreakIterator::createLineInstance(locale, status); 2183 if (U_FAILURE(status)) { 2184 printf("Unable to create thai line break iterator.\n"); 2185 return; 2186 } 2187 b->setText(text); 2188 p = index = 0; 2189 while ((p=b->next())!=BreakIterator::DONE && p < size) { 2190 if (p != expectedLineResult[index++]) { 2191 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 2192 } 2193 } 2194 2195 delete b; 2196 } 2197 2198 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 2199 // Words don't include colon or period (cldrbug #1969). 2200 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 2201 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 2202 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 2203 2204 // UBreakIteratorType UBRK_WORD, Locale "ja" 2205 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
// Test text for the "ja" word-break item.  The text is stored with doubled
// backslashes, i.e. as literal \uXXXX escape sequences; TBTest() converts it
// with CharsToUnicodeString() before handing it to the iterator.
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
// Disabled expectations, kept for reference.  In the tbItems table the "T"
// array is used as the tailored-locale offsets and the "R" array as the root
// locale offsets.
#if 0
static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
#endif
// There's no separate Japanese word break iterator. Root is the same as Japanese.
// Our dictionary-based iterator has to be tweaked to better handle U+3005,
// U+3007, U+300B and some other cases.
// NOTE(review): the rule description preceding jaWordText names U+303B, not
// U+300B; one of the two is presumably a typo - confirm against cldrbug #2009.
// The tailored and root expectations below are intentionally identical.
static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
// (The text is escaped in the same way as jaWordText; the tailored offsets
// contain the extra breaks at 8 and 14 that the root offsets do not.)
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
2227 static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 2228 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 2229 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 2230 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 2231 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 2232 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 2233 static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 2234 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 2235 29, 32, 33, 35, 37, 38, 40, 41 }; 2236 2237 typedef struct { 2238 UBreakIteratorType type; 2239 const char * locale; 2240 const char * escapedText; 2241 const int32_t * tailoredOffsets; 2242 int32_t tailoredOffsetsCount; 2243 const int32_t * rootOffsets; 2244 int32_t rootOffsetsCount; 2245 } TailoredBreakItem; 2246 2247 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 2248 2249 static const TailoredBreakItem tbItems[] = { 2250 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 2251 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 2252 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 2253 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 2254 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 2255 }; 2256 2257 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 2258 while (count-- > 0) { 2259 int writeCount; 2260 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 2261 buffer += writeCount; 2262 buflen -= writeCount; 2263 } 2264 } 2265 2266 enum { kMaxOffsetCount = 128 }; 2267 2268 void RBBITest::TBTest(BreakIterator* brkitr, int type, const 
char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 2269 brkitr->setText( CharsToUnicodeString(escapedText) ); 2270 int32_t foundOffsets[kMaxOffsetCount]; 2271 int32_t offset, foundOffsetsCount = 0; 2272 // do forwards iteration test 2273 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 2274 foundOffsets[foundOffsetsCount++] = offset; 2275 } 2276 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 2277 // log error for forwards test 2278 char formatExpect[512], formatFound[512]; 2279 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 2280 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 2281 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 2282 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 2283 } else { 2284 // do backwards iteration test 2285 --foundOffsetsCount; // back off one from the end offset 2286 while ( foundOffsetsCount > 0 ) { 2287 offset = brkitr->previous(); 2288 if ( offset != foundOffsets[--foundOffsetsCount] ) { 2289 // log error for backwards test 2290 char formatExpect[512]; 2291 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 2292 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 2293 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 2294 break; 2295 } 2296 } 2297 } 2298 } 2299 2300 void RBBITest::TestTailoredBreaks() { 2301 const TailoredBreakItem * tbItemPtr; 2302 Locale rootLocale = Locale("root"); 2303 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 2304 Locale testLocale = Locale(tbItemPtr->locale); 2305 BreakIterator * tailoredBrkiter; 
2306 BreakIterator * rootBrkiter; 2307 UErrorCode status = U_ZERO_ERROR; 2308 switch (tbItemPtr->type) { 2309 case UBRK_CHARACTER: 2310 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 2311 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 2312 break; 2313 case UBRK_WORD: 2314 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 2315 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 2316 break; 2317 case UBRK_LINE: 2318 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 2319 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 2320 break; 2321 case UBRK_SENTENCE: 2322 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 2323 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 2324 break; 2325 default: 2326 status = U_UNSUPPORTED_ERROR; 2327 break; 2328 } 2329 if (U_FAILURE(status)) { 2330 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 2331 continue; 2332 } 2333 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 2334 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 2335 2336 delete rootBrkiter; 2337 delete tailoredBrkiter; 2338 } 2339 } 2340 2341 2342 //------------------------------------------------------------------------------- 2343 // 2344 // ReadAndConvertFile Read a text data file, convert it to UChars, and 2345 // return the datain one big UChar * buffer, which the caller must delete. 2346 // 2347 // parameters: 2348 // fileName: the name of the file, with no directory part. The test data directory 2349 // is assumed. 
2350 // ulen an out parameter, receives the actual length (in UChars) of the file data. 2351 // encoding The file encoding. If the file contains a BOM, that will override the encoding 2352 // specified here. The BOM, if it exists, will be stripped from the returned data. 2353 // Pass NULL for the system default encoding. 2354 // status 2355 // returns: 2356 // The file data, converted to UChar. 2357 // The caller must delete this when done with 2358 // delete [] theBuffer; 2359 // 2360 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 2361 // Move this function to some common place. 2362 // 2363 //-------------------------------------------------------------------------------- 2364 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 2365 UChar *retPtr = NULL; 2366 char *fileBuf = NULL; 2367 UConverter* conv = NULL; 2368 FILE *f = NULL; 2369 2370 ulen = 0; 2371 if (U_FAILURE(status)) { 2372 return retPtr; 2373 } 2374 2375 // 2376 // Open the file. 
2377 // 2378 f = fopen(fileName, "rb"); 2379 if (f == 0) { 2380 dataerrln("Error opening test data file %s\n", fileName); 2381 status = U_FILE_ACCESS_ERROR; 2382 return NULL; 2383 } 2384 // 2385 // Read it in 2386 // 2387 int fileSize; 2388 int amt_read; 2389 2390 fseek( f, 0, SEEK_END); 2391 fileSize = ftell(f); 2392 fileBuf = new char[fileSize]; 2393 fseek(f, 0, SEEK_SET); 2394 amt_read = fread(fileBuf, 1, fileSize, f); 2395 if (amt_read != fileSize || fileSize <= 0) { 2396 errln("Error reading test data file."); 2397 goto cleanUpAndReturn; 2398 } 2399 2400 // 2401 // Look for a Unicode Signature (BOM) on the data just read 2402 // 2403 int32_t signatureLength; 2404 const char * fileBufC; 2405 const char* bomEncoding; 2406 2407 fileBufC = fileBuf; 2408 bomEncoding = ucnv_detectUnicodeSignature( 2409 fileBuf, fileSize, &signatureLength, &status); 2410 if(bomEncoding!=NULL ){ 2411 fileBufC += signatureLength; 2412 fileSize -= signatureLength; 2413 encoding = bomEncoding; 2414 } 2415 2416 // 2417 // Open a converter to take the rule file to UTF-16 2418 // 2419 conv = ucnv_open(encoding, &status); 2420 if (U_FAILURE(status)) { 2421 goto cleanUpAndReturn; 2422 } 2423 2424 // 2425 // Convert the rules to UChar. 2426 // Preflight first to determine required buffer size. 2427 // 2428 ulen = ucnv_toUChars(conv, 2429 NULL, // dest, 2430 0, // destCapacity, 2431 fileBufC, 2432 fileSize, 2433 &status); 2434 if (status == U_BUFFER_OVERFLOW_ERROR) { 2435 // Buffer Overflow is expected from the preflight operation. 
2436 status = U_ZERO_ERROR; 2437 2438 retPtr = new UChar[ulen+1]; 2439 ucnv_toUChars(conv, 2440 retPtr, // dest, 2441 ulen+1, 2442 fileBufC, 2443 fileSize, 2444 &status); 2445 } 2446 2447 cleanUpAndReturn: 2448 fclose(f); 2449 delete []fileBuf; 2450 ucnv_close(conv); 2451 if (U_FAILURE(status)) { 2452 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2453 delete retPtr; 2454 retPtr = 0; 2455 ulen = 0; 2456 }; 2457 return retPtr; 2458 } 2459 2460 2461 2462 //-------------------------------------------------------------------------------------------- 2463 // 2464 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 2465 // 2466 //------------------------------------------------------------------------------------------- 2467 void RBBITest::TestUnicodeFiles() { 2468 RuleBasedBreakIterator *bi; 2469 UErrorCode status = U_ZERO_ERROR; 2470 2471 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status); 2472 TEST_ASSERT_SUCCESS(status); 2473 if (U_SUCCESS(status)) { 2474 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2475 } 2476 delete bi; 2477 2478 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status); 2479 TEST_ASSERT_SUCCESS(status); 2480 if (U_SUCCESS(status)) { 2481 runUnicodeTestData("WordBreakTest.txt", bi); 2482 } 2483 delete bi; 2484 2485 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 2486 TEST_ASSERT_SUCCESS(status); 2487 if (U_SUCCESS(status)) { 2488 runUnicodeTestData("SentenceBreakTest.txt", bi); 2489 } 2490 delete bi; 2491 2492 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 2493 TEST_ASSERT_SUCCESS(status); 2494 if (U_SUCCESS(status)) { 2495 runUnicodeTestData("LineBreakTest.txt", bi); 2496 } 2497 delete bi; 2498 } 2499 2500 2501 
//-------------------------------------------------------------------------------------------- 2502 // 2503 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 2504 // 2505 //------------------------------------------------------------------------------------------- 2506 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2507 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2508 UErrorCode status = U_ZERO_ERROR; 2509 2510 // 2511 // Open and read the test data file, put it into a UnicodeString. 2512 // 2513 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2514 char testFileName[1000]; 2515 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2516 dataerrln("Can't open test data. Path too long."); 2517 return; 2518 } 2519 strcpy(testFileName, testDataDirectory); 2520 strcat(testFileName, fileName); 2521 2522 logln("Opening data file %s\n", fileName); 2523 2524 int len; 2525 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2526 if (status != U_FILE_ACCESS_ERROR) { 2527 TEST_ASSERT_SUCCESS(status); 2528 TEST_ASSERT(testFile != NULL); 2529 } 2530 if (U_FAILURE(status) || testFile == NULL) { 2531 return; /* something went wrong, error already output */ 2532 } 2533 UnicodeString testFileAsString(TRUE, testFile, len); 2534 2535 // 2536 // Parse the test data file using a regular expression. 2537 // Each kind of token is recognized in its own capture group; what type of item was scanned 2538 // is identified by which group had a match. 
2539 // 2540 // Caputure Group # 1 2 3 4 5 2541 // Parses this item: divide x hex digits comment \n unrecognized \n 2542 // 2543 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2544 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2545 UnicodeString testString; 2546 UVector32 breakPositions(status); 2547 int lineNumber = 1; 2548 TEST_ASSERT_SUCCESS(status); 2549 if (U_FAILURE(status)) { 2550 return; 2551 } 2552 2553 // 2554 // Scan through each test case, building up the string to be broken in testString, 2555 // and the positions that should be boundaries in the breakPositions vector. 2556 // 2557 while (tokenMatcher.find()) { 2558 if (tokenMatcher.start(1, status) >= 0) { 2559 // Scanned a divide sign, indicating a break position in the test data. 2560 if (testString.length()>0) { 2561 breakPositions.addElement(testString.length(), status); 2562 } 2563 } 2564 else if (tokenMatcher.start(2, status) >= 0) { 2565 // Scanned an 'x', meaning no break at this position in the test data 2566 // Nothing to be done here. 2567 } 2568 else if (tokenMatcher.start(3, status) >= 0) { 2569 // Scanned Hex digits. Convert them to binary, append to the character data string. 2570 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2571 int length = hexNumber.length(); 2572 if (length<=8) { 2573 char buf[10]; 2574 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2575 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2576 if (c<=0x10ffff) { 2577 testString.append(c); 2578 } else { 2579 errln("Error: Unicode Character value out of range. 
\'%s\', line %d.\n", 2580 fileName, lineNumber); 2581 } 2582 } else { 2583 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2584 fileName, lineNumber); 2585 } 2586 } 2587 else if (tokenMatcher.start(4, status) >= 0) { 2588 // Scanned to end of a line, possibly skipping over a comment in the process. 2589 // If the line from the file contained test data, run the test now. 2590 // 2591 if (testString.length() > 0) { 2592 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2593 } 2594 2595 // Clear out this test case. 2596 // The string and breakPositions vector will be refilled as the next 2597 // test case is parsed. 2598 testString.remove(); 2599 breakPositions.removeAllElements(); 2600 lineNumber++; 2601 } else { 2602 // Scanner catchall. Something unrecognized appeared on the line. 2603 char token[16]; 2604 UnicodeString uToken = tokenMatcher.group(0, status); 2605 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2606 token[sizeof(token)-1] = 0; 2607 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2608 2609 // Clean up, in preparation for continuing with the next line. 2610 testString.remove(); 2611 breakPositions.removeAllElements(); 2612 lineNumber++; 2613 } 2614 TEST_ASSERT_SUCCESS(status); 2615 if (U_FAILURE(status)) { 2616 break; 2617 } 2618 } 2619 2620 delete [] testFile; 2621 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2622 } 2623 2624 //-------------------------------------------------------------------------------------------- 2625 // 2626 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2627 // test data files. Do only a simple, forward-only check - 2628 // this test is mostly to check that ICU and the Unicode 2629 // data agree with each other. 
2630 // 2631 //-------------------------------------------------------------------------------------------- 2632 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2633 const UnicodeString &testString, // Text data to be broken 2634 UVector32 *breakPositions, // Positions where breaks should be found. 2635 RuleBasedBreakIterator *bi) { 2636 int32_t pos; // Break Position in the test string 2637 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 2638 int32_t expectedPos; // Expected break position (index into test string) 2639 2640 bi->setText(testString); 2641 pos = bi->first(); 2642 pos = bi->next(); 2643 2644 while (pos != BreakIterator::DONE) { 2645 if (expectedI >= breakPositions->size()) { 2646 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2647 testFileName, lineNumber, pos); 2648 break; 2649 } 2650 expectedPos = breakPositions->elementAti(expectedI); 2651 if (pos < expectedPos) { 2652 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2653 testFileName, lineNumber, pos); 2654 break; 2655 } 2656 if (pos > expectedPos) { 2657 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2658 testFileName, lineNumber, expectedPos); 2659 break; 2660 } 2661 pos = bi->next(); 2662 expectedI++; 2663 } 2664 2665 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2666 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2667 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2668 } 2669 } 2670 2671 2672 2673 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2674 //--------------------------------------------------------------------------------------- 2675 // 2676 // classs RBBIMonkeyKind 2677 // 2678 // Monkey Test for Break Iteration 2679 // Abstract interface class. Concrete derived classes independently 2680 // implement the break rules for different iterator types. 
2681 // 2682 // The Monkey Test itself uses doesn't know which type of break iterator it is 2683 // testing, but works purely in terms of the interface defined here. 2684 // 2685 //--------------------------------------------------------------------------------------- 2686 class RBBIMonkeyKind { 2687 public: 2688 // Return a UVector of UnicodeSets, representing the character classes used 2689 // for this type of iterator. 2690 virtual UVector *charClasses() = 0; 2691 2692 // Set the test text on which subsequent calls to next() will operate 2693 virtual void setText(const UnicodeString &s) = 0; 2694 2695 // Find the next break postion, starting from the prev break position, or from zero. 2696 // Return -1 after reaching end of string. 2697 virtual int32_t next(int32_t i) = 0; 2698 2699 virtual ~RBBIMonkeyKind(); 2700 UErrorCode deferredStatus; 2701 2702 2703 protected: 2704 RBBIMonkeyKind(); 2705 2706 private: 2707 }; 2708 2709 RBBIMonkeyKind::RBBIMonkeyKind() { 2710 deferredStatus = U_ZERO_ERROR; 2711 } 2712 2713 RBBIMonkeyKind::~RBBIMonkeyKind() { 2714 } 2715 2716 2717 //---------------------------------------------------------------------------------------- 2718 // 2719 // Random Numbers. Similar to standard lib rand() and srand() 2720 // Not using library to 2721 // 1. Get same results on all platforms. 2722 // 2. Get access to current seed, to more easily reproduce failures. 2723 // 2724 //--------------------------------------------------------------------------------------- 2725 static uint32_t m_seed = 1; 2726 2727 static uint32_t m_rand() 2728 { 2729 m_seed = m_seed * 1103515245 + 12345; 2730 return (uint32_t)(m_seed/65536) % 32768; 2731 } 2732 2733 2734 //------------------------------------------------------------------------------------------ 2735 // 2736 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2737 // of RBBIMonkeyKind. 
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // The list of character classes returned by charClasses().  It holds pointers
    // to some of the sets below; the sets themselves are deleted individually by
    // the destructor.
    UVector   *fSets;

    // One set per character class used by the rules in next().
    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;    // union of the L, V, T, LV and LVT sets
    UnicodeSet  *fAnySet;       // all code points, U+0000 - U+10FFFF

    // The text being tested.  Points at the caller's string (see setText()); not owned.
    const UnicodeString *fText;
};


// Build the character class sets from their Grapheme_Cluster_Break property values
// and collect the classes reported by charClasses() in fSets.  Any failure is
// recorded in deferredStatus, which next() checks before doing any work.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;

    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
    // The Hangul classes are reported to the caller as one combined class.
    fHangulSet  = new UnicodeSet();
    fHangulSet->addAll(*fLSet);
    fHangulSet->addAll(*fVSet);
    fHangulSet->addAll(*fTSet);
    fHangulSet->addAll(*fLVSet);
    fHangulSet->addAll(*fLVTSet);
    fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);

    fSets       = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fPrependSet, status);
    fSets->addElement(fSpacingSet, status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text to be tested.  Only the address of s is kept, not a copy.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}



// Find the first grapheme cluster boundary after prevPos by applying the rules
// below, in order, at each successive code point position.
// Returns the boundary position, or -1 if prevPos is already at or beyond the end
// of the text, or if construction of this object failed.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    // 'continue' means "no break here, try the next position";
    // 'break' means "p2 is the boundary".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}



// Return the list of character classes built by the constructor.
// The vector remains the property of this object.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


// Delete the class list and every set created by the constructor.
// fText is not deleted; it refers to the string passed to setText().
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
}


//------------------------------------------------------------------------------------------ 2943 // 2944 // class RBBIWordMonkey Word Break specific implementation 2945 // of RBBIMonkeyKind. 2946 // 2947 //------------------------------------------------------------------------------------------ 2948 class RBBIWordMonkey: public RBBIMonkeyKind { 2949 public: 2950 RBBIWordMonkey(); 2951 virtual ~RBBIWordMonkey(); 2952 virtual UVector *charClasses(); 2953 virtual void setText(const UnicodeString &s); 2954 virtual int32_t next(int32_t i); 2955 private: 2956 UVector *fSets; 2957 2958 UnicodeSet *fCRSet; 2959 UnicodeSet *fLFSet; 2960 UnicodeSet *fNewlineSet; 2961 UnicodeSet *fKatakanaSet; 2962 UnicodeSet *fALetterSet; 2963 // TODO(jungshik): Do we still need this change? 2964 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 2965 UnicodeSet *fMidNumLetSet; 2966 UnicodeSet *fMidLetterSet; 2967 UnicodeSet *fMidNumSet; 2968 UnicodeSet *fNumericSet; 2969 UnicodeSet *fFormatSet; 2970 UnicodeSet *fOtherSet; 2971 UnicodeSet *fExtendSet; 2972 UnicodeSet *fExtendNumLetSet; 2973 UnicodeSet *fDictionaryCjkSet; 2974 2975 RegexMatcher *fMatcher; 2976 2977 const UnicodeString *fText; 2978 }; 2979 2980 2981 RBBIWordMonkey::RBBIWordMonkey() 2982 { 2983 UErrorCode status = U_ZERO_ERROR; 2984 2985 fSets = new UVector(status); 2986 2987 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2988 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2989 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2990 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); 2991 // Exclude Hangul syllables from ALetterSet during testing. 2992 // Leave CJK dictionary characters out from the monkey tests! 
2993 #if 0 2994 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2995 "[\\p{Line_Break = Complex_Context}" 2996 "-\\p{Grapheme_Cluster_Break = Extend}" 2997 "-\\p{Grapheme_Cluster_Break = Control}" 2998 "]]", 2999 status); 3000 #endif 3001 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3002 fALetterSet->removeAll(*fDictionaryCjkSet); 3003 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 3004 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 3005 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 3006 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 3007 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); 3008 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 3009 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 3010 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 3011 3012 fOtherSet = new UnicodeSet(); 3013 if(U_FAILURE(status)) { 3014 deferredStatus = status; 3015 return; 3016 } 3017 3018 fOtherSet->complement(); 3019 fOtherSet->removeAll(*fCRSet); 3020 fOtherSet->removeAll(*fLFSet); 3021 fOtherSet->removeAll(*fNewlineSet); 3022 fOtherSet->removeAll(*fKatakanaSet); 3023 fOtherSet->removeAll(*fALetterSet); 3024 fOtherSet->removeAll(*fMidLetterSet); 3025 fOtherSet->removeAll(*fMidNumSet); 3026 fOtherSet->removeAll(*fNumericSet); 3027 fOtherSet->removeAll(*fExtendNumLetSet); 3028 fOtherSet->removeAll(*fFormatSet); 3029 fOtherSet->removeAll(*fExtendSet); 3030 // Inhibit dictionary characters from being tested at all. 
3031 fOtherSet->removeAll(*fDictionaryCjkSet); 3032 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 3033 3034 fSets->addElement(fCRSet, status); 3035 fSets->addElement(fLFSet, status); 3036 fSets->addElement(fNewlineSet, status); 3037 fSets->addElement(fALetterSet, status); 3038 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 3039 fSets->addElement(fMidLetterSet, status); 3040 fSets->addElement(fMidNumLetSet, status); 3041 fSets->addElement(fMidNumSet, status); 3042 fSets->addElement(fNumericSet, status); 3043 fSets->addElement(fFormatSet, status); 3044 fSets->addElement(fExtendSet, status); 3045 fSets->addElement(fOtherSet, status); 3046 fSets->addElement(fExtendNumLetSet, status); 3047 3048 if (U_FAILURE(status)) { 3049 deferredStatus = status; 3050 } 3051 } 3052 3053 void RBBIWordMonkey::setText(const UnicodeString &s) { 3054 fText = &s; 3055 } 3056 3057 3058 int32_t RBBIWordMonkey::next(int32_t prevPos) { 3059 int p0, p1, p2, p3; // Indices of the significant code points around the 3060 // break position being tested. The candidate break 3061 // location is before p2. 3062 3063 int breakPos = -1; 3064 3065 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3066 3067 if (U_FAILURE(deferredStatus)) { 3068 return -1; 3069 } 3070 3071 // Prev break at end of string. return DONE. 3072 if (prevPos >= fText->length()) { 3073 return -1; 3074 } 3075 p0 = p1 = p2 = p3 = prevPos; 3076 c3 = fText->char32At(prevPos); 3077 c0 = c1 = c2 = 0; 3078 3079 // Loop runs once per "significant" character position in the input text. 3080 for (;;) { 3081 // Move all of the positions forward in the input string. 3082 p0 = p1; c0 = c1; 3083 p1 = p2; c1 = c2; 3084 p2 = p3; c2 = c3; 3085 3086 // Advancd p3 by X(Extend | Format)* Rule 4 3087 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 3088 do { 3089 p3 = fText->moveIndex32(p3, 1); 3090 c3 = fText->char32At(p3); 3091 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 3092 break; 3093 }; 3094 } 3095 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 3096 3097 3098 if (p1 == p2) { 3099 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3100 continue; 3101 } 3102 if (p2 == fText->length()) { 3103 // Reached end of string. Always a break position. 3104 break; 3105 } 3106 3107 // Rule (3) CR x LF 3108 // No Extend or Format characters may appear between the CR and LF, 3109 // which requires the additional check for p2 immediately following p1. 3110 // 3111 if (c1==0x0D && c2==0x0A) { 3112 continue; 3113 } 3114 3115 // Rule (3a) Break before and after newlines (including CR and LF) 3116 // 3117 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 3118 break; 3119 }; 3120 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 3121 break; 3122 }; 3123 3124 // Rule (5). 
ALetter x ALetter 3125 if (fALetterSet->contains(c1) && 3126 fALetterSet->contains(c2)) { 3127 continue; 3128 } 3129 3130 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 3131 // 3132 if ( fALetterSet->contains(c1) && 3133 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 3134 fALetterSet->contains(c3)) { 3135 continue; 3136 } 3137 3138 3139 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 3140 if (fALetterSet->contains(c0) && 3141 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 3142 fALetterSet->contains(c2)) { 3143 continue; 3144 } 3145 3146 // Rule (8) Numeric x Numeric 3147 if (fNumericSet->contains(c1) && 3148 fNumericSet->contains(c2)) { 3149 continue; 3150 } 3151 3152 // Rule (9) ALetter x Numeric 3153 if (fALetterSet->contains(c1) && 3154 fNumericSet->contains(c2)) { 3155 continue; 3156 } 3157 3158 // Rule (10) Numeric x ALetter 3159 if (fNumericSet->contains(c1) && 3160 fALetterSet->contains(c2)) { 3161 continue; 3162 } 3163 3164 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 3165 if (fNumericSet->contains(c0) && 3166 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 3167 fNumericSet->contains(c2)) { 3168 continue; 3169 } 3170 3171 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 3172 if (fNumericSet->contains(c1) && 3173 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 3174 fNumericSet->contains(c3)) { 3175 continue; 3176 } 3177 3178 // Rule (13) Katakana x Katakana 3179 if (fKatakanaSet->contains(c1) && 3180 fKatakanaSet->contains(c2)) { 3181 continue; 3182 } 3183 3184 // Rule 13a 3185 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 3186 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 3187 fExtendNumLetSet->contains(c2)) { 3188 continue; 3189 } 3190 3191 // Rule 13b 3192 if (fExtendNumLetSet->contains(c1) && 3193 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 3194 fKatakanaSet->contains(c2))) { 3195 continue; 3196 } 3197 3198 // Rule 14. 
Break found here. 3199 break; 3200 } 3201 3202 breakPos = p2; 3203 return breakPos; 3204 } 3205 3206 3207 UVector *RBBIWordMonkey::charClasses() { 3208 return fSets; 3209 } 3210 3211 3212 RBBIWordMonkey::~RBBIWordMonkey() { 3213 delete fSets; 3214 delete fCRSet; 3215 delete fLFSet; 3216 delete fNewlineSet; 3217 delete fKatakanaSet; 3218 delete fALetterSet; 3219 delete fMidNumLetSet; 3220 delete fMidLetterSet; 3221 delete fMidNumSet; 3222 delete fNumericSet; 3223 delete fFormatSet; 3224 delete fExtendSet; 3225 delete fExtendNumLetSet; 3226 delete fOtherSet; 3227 } 3228 3229 3230 3231 3232 //------------------------------------------------------------------------------------------ 3233 // 3234 // class RBBISentMonkey Sentence Break specific implementation 3235 // of RBBIMonkeyKind. 3236 // 3237 //------------------------------------------------------------------------------------------ 3238 class RBBISentMonkey: public RBBIMonkeyKind { 3239 public: 3240 RBBISentMonkey(); 3241 virtual ~RBBISentMonkey(); 3242 virtual UVector *charClasses(); 3243 virtual void setText(const UnicodeString &s); 3244 virtual int32_t next(int32_t i); 3245 private: 3246 int moveBack(int posFrom); 3247 int moveForward(int posFrom); 3248 UChar32 cAt(int pos); 3249 3250 UVector *fSets; 3251 3252 UnicodeSet *fSepSet; 3253 UnicodeSet *fFormatSet; 3254 UnicodeSet *fSpSet; 3255 UnicodeSet *fLowerSet; 3256 UnicodeSet *fUpperSet; 3257 UnicodeSet *fOLetterSet; 3258 UnicodeSet *fNumericSet; 3259 UnicodeSet *fATermSet; 3260 UnicodeSet *fSContinueSet; 3261 UnicodeSet *fSTermSet; 3262 UnicodeSet *fCloseSet; 3263 UnicodeSet *fOtherSet; 3264 UnicodeSet *fExtendSet; 3265 3266 const UnicodeString *fText; 3267 3268 }; 3269 3270 RBBISentMonkey::RBBISentMonkey() 3271 { 3272 UErrorCode status = U_ZERO_ERROR; 3273 3274 fSets = new UVector(status); 3275 3276 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 3277 // set and made into character classes of their own. 
For the monkey impl, 3278 // they remain in SEP, since Sep always appears with CR and LF in the rules. 3279 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 3280 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 3281 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 3282 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 3283 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 3284 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 3285 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 3286 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 3287 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 3288 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 3289 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 3290 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 3291 fOtherSet = new UnicodeSet(); 3292 3293 if(U_FAILURE(status)) { 3294 deferredStatus = status; 3295 return; 3296 } 3297 3298 fOtherSet->complement(); 3299 fOtherSet->removeAll(*fSepSet); 3300 fOtherSet->removeAll(*fFormatSet); 3301 fOtherSet->removeAll(*fSpSet); 3302 fOtherSet->removeAll(*fLowerSet); 3303 fOtherSet->removeAll(*fUpperSet); 3304 fOtherSet->removeAll(*fOLetterSet); 3305 fOtherSet->removeAll(*fNumericSet); 3306 fOtherSet->removeAll(*fATermSet); 3307 fOtherSet->removeAll(*fSContinueSet); 3308 fOtherSet->removeAll(*fSTermSet); 3309 fOtherSet->removeAll(*fCloseSet); 3310 fOtherSet->removeAll(*fExtendSet); 3311 3312 fSets->addElement(fSepSet, status); 3313 
fSets->addElement(fFormatSet, status); 3314 fSets->addElement(fSpSet, status); 3315 fSets->addElement(fLowerSet, status); 3316 fSets->addElement(fUpperSet, status); 3317 fSets->addElement(fOLetterSet, status); 3318 fSets->addElement(fNumericSet, status); 3319 fSets->addElement(fATermSet, status); 3320 fSets->addElement(fSContinueSet, status); 3321 fSets->addElement(fSTermSet, status); 3322 fSets->addElement(fCloseSet, status); 3323 fSets->addElement(fOtherSet, status); 3324 fSets->addElement(fExtendSet, status); 3325 3326 if (U_FAILURE(status)) { 3327 deferredStatus = status; 3328 } 3329 } 3330 3331 3332 3333 void RBBISentMonkey::setText(const UnicodeString &s) { 3334 fText = &s; 3335 } 3336 3337 UVector *RBBISentMonkey::charClasses() { 3338 return fSets; 3339 } 3340 3341 3342 // moveBack() Find the "significant" code point preceding the index i. 3343 // Skips over ($Extend | $Format)* . 3344 // 3345 int RBBISentMonkey::moveBack(int i) { 3346 if (i <= 0) { 3347 return -1; 3348 } 3349 UChar32 c; 3350 int32_t j = i; 3351 do { 3352 j = fText->moveIndex32(j, -1); 3353 c = fText->char32At(j); 3354 } 3355 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 3356 return j; 3357 3358 } 3359 3360 3361 int RBBISentMonkey::moveForward(int i) { 3362 if (i>=fText->length()) { 3363 return fText->length(); 3364 } 3365 UChar32 c; 3366 int32_t j = i; 3367 do { 3368 j = fText->moveIndex32(j, 1); 3369 c = cAt(j); 3370 } 3371 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 3372 return j; 3373 } 3374 3375 UChar32 RBBISentMonkey::cAt(int pos) { 3376 if (pos<0 || pos>=fText->length()) { 3377 return -1; 3378 } else { 3379 return fText->char32At(pos); 3380 } 3381 } 3382 3383 int32_t RBBISentMonkey::next(int32_t prevPos) { 3384 int p0, p1, p2, p3; // Indices of the significant code points around the 3385 // break position being tested. The candidate break 3386 // location is before p2. 
3387 3388 int breakPos = -1; 3389 3390 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3391 UChar32 c; 3392 3393 if (U_FAILURE(deferredStatus)) { 3394 return -1; 3395 } 3396 3397 // Prev break at end of string. return DONE. 3398 if (prevPos >= fText->length()) { 3399 return -1; 3400 } 3401 p0 = p1 = p2 = p3 = prevPos; 3402 c3 = fText->char32At(prevPos); 3403 c0 = c1 = c2 = 0; 3404 3405 // Loop runs once per "significant" character position in the input text. 3406 for (;;) { 3407 // Move all of the positions forward in the input string. 3408 p0 = p1; c0 = c1; 3409 p1 = p2; c1 = c2; 3410 p2 = p3; c2 = c3; 3411 3412 // Advancd p3 by X(Extend | Format)* Rule 4 3413 p3 = moveForward(p3); 3414 c3 = cAt(p3); 3415 3416 // Rule (3) CR x LF 3417 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 3418 continue; 3419 } 3420 3421 // Rule (4). Sep <break> 3422 if (fSepSet->contains(c1)) { 3423 p2 = p1+1; // Separators don't combine with Extend or Format. 3424 break; 3425 } 3426 3427 if (p2 >= fText->length()) { 3428 // Reached end of string. Always a break position. 3429 break; 3430 } 3431 3432 if (p2 == prevPos) { 3433 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3434 continue; 3435 } 3436 3437 // Rule (6). ATerm x Numeric 3438 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 3439 continue; 3440 } 3441 3442 // Rule (7). Upper ATerm x Uppper 3443 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 3444 continue; 3445 } 3446 3447 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 3448 // Note: STerm | ATerm are added to the negated part of the expression by a 3449 // note to the Unicode 5.0 documents. 
3450 int p8 = p1; 3451 while (fSpSet->contains(cAt(p8))) { 3452 p8 = moveBack(p8); 3453 } 3454 while (fCloseSet->contains(cAt(p8))) { 3455 p8 = moveBack(p8); 3456 } 3457 if (fATermSet->contains(cAt(p8))) { 3458 p8=p2; 3459 for (;;) { 3460 c = cAt(p8); 3461 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 3462 fLowerSet->contains(c) || fSepSet->contains(c) || 3463 fATermSet->contains(c) || fSTermSet->contains(c)) { 3464 break; 3465 } 3466 p8 = moveForward(p8); 3467 } 3468 if (fLowerSet->contains(cAt(p8))) { 3469 continue; 3470 } 3471 } 3472 3473 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 3474 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 3475 p8 = p1; 3476 while (fSpSet->contains(cAt(p8))) { 3477 p8 = moveBack(p8); 3478 } 3479 while (fCloseSet->contains(cAt(p8))) { 3480 p8 = moveBack(p8); 3481 } 3482 c = cAt(p8); 3483 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 3484 continue; 3485 } 3486 } 3487 3488 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 3489 int p9 = p1; 3490 while (fCloseSet->contains(cAt(p9))) { 3491 p9 = moveBack(p9); 3492 } 3493 c = cAt(p9); 3494 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 3495 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 3496 continue; 3497 } 3498 } 3499 3500 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 3501 int p10 = p1; 3502 while (fSpSet->contains(cAt(p10))) { 3503 p10 = moveBack(p10); 3504 } 3505 while (fCloseSet->contains(cAt(p10))) { 3506 p10 = moveBack(p10); 3507 } 3508 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 3509 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 3510 continue; 3511 } 3512 } 3513 3514 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 3515 int p11 = p1; 3516 if (fSepSet->contains(cAt(p11))) { 3517 p11 = moveBack(p11); 3518 } 3519 while (fSpSet->contains(cAt(p11))) { 3520 p11 = moveBack(p11); 3521 } 3522 while (fCloseSet->contains(cAt(p11))) { 3523 p11 = moveBack(p11); 3524 } 3525 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 3526 break; 3527 } 3528 3529 // Rule (12) Any x Any 3530 continue; 3531 } 3532 breakPos = p2; 3533 return breakPos; 3534 } 3535 3536 RBBISentMonkey::~RBBISentMonkey() { 3537 delete fSets; 3538 delete fSepSet; 3539 delete fFormatSet; 3540 delete fSpSet; 3541 delete fLowerSet; 3542 delete fUpperSet; 3543 delete fOLetterSet; 3544 delete fNumericSet; 3545 delete fATermSet; 3546 delete fSContinueSet; 3547 delete fSTermSet; 3548 delete fCloseSet; 3549 delete fOtherSet; 3550 delete fExtendSet; 3551 } 3552 3553 3554 3555 //------------------------------------------------------------------------------------------- 3556 // 3557 // RBBILineMonkey 3558 // 3559 //------------------------------------------------------------------------------------------- 3560 3561 class RBBILineMonkey: public RBBIMonkeyKind { 3562 public: 3563 RBBILineMonkey(); 3564 virtual ~RBBILineMonkey(); 3565 virtual UVector *charClasses(); 3566 virtual void setText(const UnicodeString &s); 3567 virtual int32_t next(int32_t i); 3568 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 3569 private: 3570 UVector *fSets; 3571 3572 UnicodeSet *fBK; 3573 UnicodeSet *fCR; 3574 UnicodeSet *fLF; 3575 UnicodeSet *fCM; 3576 UnicodeSet *fNL; 3577 UnicodeSet *fSG; 3578 UnicodeSet *fWJ; 3579 UnicodeSet *fZW; 3580 UnicodeSet *fGL; 3581 UnicodeSet *fCB; 3582 UnicodeSet *fSP; 3583 UnicodeSet *fB2; 3584 UnicodeSet *fBA; 3585 UnicodeSet *fBB; 3586 UnicodeSet *fHY; 3587 UnicodeSet *fH2; 3588 UnicodeSet *fH3; 3589 UnicodeSet *fCL; 3590 UnicodeSet *fEX; 3591 UnicodeSet *fIN; 3592 UnicodeSet *fJL; 3593 UnicodeSet *fJV; 3594 UnicodeSet *fJT; 3595 UnicodeSet *fNS; 
3596 UnicodeSet *fOP; 3597 UnicodeSet *fQU; 3598 UnicodeSet *fIS; 3599 UnicodeSet *fNU; 3600 UnicodeSet *fPO; 3601 UnicodeSet *fPR; 3602 UnicodeSet *fSY; 3603 UnicodeSet *fAI; 3604 UnicodeSet *fAL; 3605 UnicodeSet *fID; 3606 UnicodeSet *fSA; 3607 UnicodeSet *fXX; 3608 3609 BreakIterator *fCharBI; 3610 3611 const UnicodeString *fText; 3612 int32_t *fOrigPositions; 3613 3614 RegexMatcher *fNumberMatcher; 3615 RegexMatcher *fLB11Matcher; 3616 }; 3617 3618 3619 RBBILineMonkey::RBBILineMonkey() 3620 { 3621 UErrorCode status = U_ZERO_ERROR; 3622 3623 fSets = new UVector(status); 3624 3625 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3626 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3627 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3628 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3629 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3630 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3631 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3632 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3633 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3634 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3635 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3636 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3637 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3638 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3639 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3640 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3641 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 3642 fEX = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3643 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3644 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3645 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3646 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3647 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3648 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3649 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3650 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3651 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3652 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3653 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3654 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3655 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3656 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3657 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3658 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3659 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3660 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3661 3662 if (U_FAILURE(status)) { 3663 deferredStatus = status; 3664 fCharBI = NULL; 3665 fNumberMatcher = NULL; 3666 return; 3667 } 3668 3669 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3670 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3671 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 3672 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 
3673 3674 fSets->addElement(fBK, status); 3675 fSets->addElement(fCR, status); 3676 fSets->addElement(fLF, status); 3677 fSets->addElement(fCM, status); 3678 fSets->addElement(fNL, status); 3679 fSets->addElement(fWJ, status); 3680 fSets->addElement(fZW, status); 3681 fSets->addElement(fGL, status); 3682 fSets->addElement(fCB, status); 3683 fSets->addElement(fSP, status); 3684 fSets->addElement(fB2, status); 3685 fSets->addElement(fBA, status); 3686 fSets->addElement(fBB, status); 3687 fSets->addElement(fHY, status); 3688 fSets->addElement(fH2, status); 3689 fSets->addElement(fH3, status); 3690 fSets->addElement(fCL, status); 3691 fSets->addElement(fEX, status); 3692 fSets->addElement(fIN, status); 3693 fSets->addElement(fJL, status); 3694 fSets->addElement(fJT, status); 3695 fSets->addElement(fJV, status); 3696 fSets->addElement(fNS, status); 3697 fSets->addElement(fOP, status); 3698 fSets->addElement(fQU, status); 3699 fSets->addElement(fIS, status); 3700 fSets->addElement(fNU, status); 3701 fSets->addElement(fPO, status); 3702 fSets->addElement(fPR, status); 3703 fSets->addElement(fSY, status); 3704 fSets->addElement(fAI, status); 3705 fSets->addElement(fAL, status); 3706 fSets->addElement(fID, status); 3707 fSets->addElement(fWJ, status); 3708 fSets->addElement(fSA, status); 3709 fSets->addElement(fSG, status); 3710 3711 const char *rules = 3712 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3713 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3714 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3715 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3716 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?" 
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";

    // NOTE(review): the lines down to the closing brace are the tail of a definition
    // that begins above this chunk (presumably the RBBILineMonkey constructor -- confirm).
    // The number-matching regular expression built above is compiled here; it is used
    // by the LB 25 check in next().
    fNumberMatcher = new RegexMatcher(
        UnicodeString(rules, -1, US_INV), 0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

    // Failures are recorded rather than reported here; next() checks deferredStatus
    // and returns -1 when it holds a failure.
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


//
//  setText
//     Set the text that subsequent calls to next() will examine.
//     Only the address of the string is stored (no copy is made), so the string
//     presumably has to outlive this object's use of it -- verify against callers.
//     The character break iterator and the number matcher are re-targeted at the
//     same string.
//
void RBBILineMonkey::setText(const UnicodeString &s) {
    fText       = &s;
    fCharBI->setText(s);
    fNumberMatcher->reset(s);
}

//
//  rule9Adjust
//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences
//     that must be treated as if they were something other than what they actually are.
//
//     This is factored out into a separate function because it must be applied twice for
//     each potential break, once to the chars before the position being checked, then
//     again to the text following the possible break.
//
//     Parameters:
//        pos       - index of the character being adjusted; -1 means "no character yet"
//                    and makes this function a no-op.
//        posChar   - in/out: the character at pos.  Replaced by 0x41 ('A') when it is
//                    itself a Line Break CM character (LB 10).
//        nextPos   - in/out: index of the character following pos.  Advanced past any
//                    CM characters attached to posChar (LB 9).
//        nextChar  - out: the character at the (possibly advanced) nextPos.
//
void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
    if (pos == -1) {
        // Invalid initial position.  Happens during the warmup iteration of the
        //   main loop in next().
        return;
    }

    int32_t  nPos = *nextPos;

    // LB 9  Keep combining sequences together.
    //  advance over any CM class chars.  Note that Line Break CM is different
    //  from the normal Grapheme Extend property.
    //  Combining marks do not attach to SP, BK, CR, LF, NL or ZW, so the scan is
    //  skipped when posChar is one of those.
    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
        for (;;) {
            *nextChar = fText->char32At(nPos);
            if (!fCM->contains(*nextChar)) {
                break;
            }
            nPos = fText->moveIndex32(nPos, 1);
        }
    }


    // LB 9 Treat X CM* as if it were x.
    //       No explicit action required.

    // LB 10  Treat any remaining combining mark as AL
    if (fCM->contains(*posChar)) {
        *posChar = 0x41;   // thisChar = 'A';
    }

    // Push the updated nextPos and nextChar back to our caller.
    // This only makes a difference if posChar got bigger by consuming a
    // combining sequence.
    *nextPos  = nPos;
    *nextChar = fText->char32At(nPos);
}



//
//  next
//     Reference implementation of the line break rules.  Starting from startPos,
//     walk forward through the text set by setText(), applying the rules in order
//     to each pair (prevChar, thisChar) until one of them calls for a break.
//
//     Within the loop, "continue" means "no break here, move on to the next position"
//     and "break" means "a boundary was found in front of pos".
//
//     Returns the index of the boundary found (the index of the character that
//     follows the break), or -1 if object construction failed (deferredStatus)
//     or startPos is at or beyond the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
                          //  NOTE(review): assigned below but never read in this function.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the window forward by one position: this -> prev, next -> this.
        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             First call: let prevChar absorb any combining marks, which may move pos.
        //             Second call: let thisChar absorb its combining marks, which may move nextPos.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //                                   -1 positions out of prevPos yet - loop back to advance the
        //                                    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //        NOTE(review): the number regular expression is applied under the
        //        "LB 25" heading below; "LB 17" here looks like older rule numbering -- confirm.
        //
        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
                                        fEX->contains(thisChar) ||
            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // NOTE(review): this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   CL SP* x NS
        //    Scan backwards for SP* CM* CL
        if (fNS->contains(thisChar)) {
            // NOTE(review): shadows the function-level tPos, as in LB 15.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        //    No break before BA, HY or NS; no break after BB.
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 22
        //    No break between AL, ID, IN or NU and a following IN.
        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
            fID->contains(prevChar) && fIN->contains(thisChar) ||
            fIN->contains(prevChar) && fIN->contains(thisChar) ||
            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //          PR x ID
        //          PR x AL
        //          PO x AL
        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
            fPR->contains(prevChar) && fAL->contains(thisChar) ||
            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }



        // LB 25    Numbers
        //    The number regular expression is anchored at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up over trailing combining marks so that pos / thisChar
                    // refer to the last non-CM character of the number.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28 Do not break between alphabetics ("at").
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 31 Break everywhere else
        break;

    }

    return pos;
}


//
//  charClasses
//     Return the vector of character class sets held by this object (fSets).
//     The vector is deleted by the destructor below, so the caller does not own it.
//
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}


//
//  Destructor
//     Deletes the vector of sets, each individual character class set, the
//     character break iterator and the number matcher.
//
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fID;
    delete fSA;
    delete fSG;
    delete fXX;

    delete fCharBI;
    delete fNumberMatcher;
}


//-------------------------------------------------------------------------------------------
//
//   TestMonkey
4190 // 4191 // params 4192 // seed=nnnnn Random number starting seed. 4193 // Setting the seed allows errors to be reproduced. 4194 // loop=nnn Looping count. Controls running time. 4195 // -1: run forever. 4196 // 0 or greater: run length. 4197 // 4198 // type = char | word | line | sent | title 4199 // 4200 //------------------------------------------------------------------------------------------- 4201 4202 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 4203 int32_t val = defaultVal; 4204 name.append(" *= *(-?\\d+)"); 4205 UErrorCode status = U_ZERO_ERROR; 4206 RegexMatcher m(name, params, 0, status); 4207 if (m.find()) { 4208 // The param exists. Convert the string to an int. 4209 char valString[100]; 4210 int32_t paramLength = m.end(1, status) - m.start(1, status); 4211 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 4212 paramLength = (int32_t)(sizeof(valString)-2); 4213 } 4214 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 4215 val = strtol(valString, NULL, 10); 4216 4217 // Delete this parameter from the params string. 
4218 m.reset(); 4219 params = m.replaceFirst("", status); 4220 } 4221 U_ASSERT(U_SUCCESS(status)); 4222 return val; 4223 } 4224 #endif 4225 4226 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 4227 BreakIterator *bi, 4228 int expected[], 4229 int expectedcount) 4230 { 4231 int count = 0; 4232 int i = 0; 4233 int forward[50]; 4234 bi->setText(ustr); 4235 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4236 forward[count] = i; 4237 if (count < expectedcount && expected[count] != i) { 4238 test->errln("break forward test failed: expected %d but got %d", 4239 expected[count], i); 4240 break; 4241 } 4242 count ++; 4243 } 4244 if (count != expectedcount) { 4245 printStringBreaks(ustr, expected, expectedcount); 4246 test->errln("break forward test failed: missed %d match", 4247 expectedcount - count); 4248 return; 4249 } 4250 // testing boundaries 4251 for (i = 1; i < expectedcount; i ++) { 4252 int j = expected[i - 1]; 4253 if (!bi->isBoundary(j)) { 4254 printStringBreaks(ustr, expected, expectedcount); 4255 test->errln("isBoundary() failed. Expected boundary at position %d", j); 4256 return; 4257 } 4258 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 4259 if (bi->isBoundary(j)) { 4260 printStringBreaks(ustr, expected, expectedcount); 4261 test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); 4262 return; 4263 } 4264 } 4265 } 4266 4267 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 4268 count --; 4269 if (forward[count] != i) { 4270 printStringBreaks(ustr, expected, expectedcount); 4271 test->errln("happy break test previous() failed: expected %d but got %d", 4272 forward[count], i); 4273 break; 4274 } 4275 } 4276 if (count != 0) { 4277 printStringBreaks(ustr, expected, expectedcount); 4278 test->errln("break test previous() failed: missed a match"); 4279 return; 4280 } 4281 4282 // testing preceding 4283 for (i = 0; i < expectedcount - 1; i ++) { 4284 // int j = expected[i] + 1; 4285 int j = ustr.moveIndex32(expected[i], 1); 4286 for (; j <= expected[i + 1]; j ++) { 4287 if (bi->preceding(j) != expected[i]) { 4288 printStringBreaks(ustr, expected, expectedcount); 4289 test->errln("preceding(): Not expecting boundary at position %d", j); 4290 return; 4291 } 4292 } 4293 } 4294 } 4295 4296 void RBBITest::TestWordBreaks(void) 4297 { 4298 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4299 4300 Locale locale("en"); 4301 UErrorCode status = U_ZERO_ERROR; 4302 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4303 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4304 // Replaced any C+J characters in a row with a random sequence of characters 4305 // of the same length to make our C+J segmentation not get in the way. 
4306 static const char *strlist[] = 4307 { 4308 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 4309 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 4310 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 4311 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 4312 "\\uac00\\u3588\\u009c\\u0953\\u194b", 4313 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4314 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 4315 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 4316 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4317 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4318 "\\u2027\\U000e0067\\u0a47\\u00b7", 4319 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4320 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4321 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4322 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 4323 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4324 "\\u0027\\u11af\\U000e0057\\u0602", 4325 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4326 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4327 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4328 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4329 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4330 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4331 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4332 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4333 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4334 "\\u18f4\\U000e0049\\u20e7\\u2027", 4335 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4336 "\\ua183\\u102d\\u0bec\\u003a", 4337 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4338 "\\u003a\\u0e57\\u0fad\\u002e", 4339 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4340 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4341 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 4342 
"\\u003a\\u0664\\u00b7\\u1fba", 4343 "\\u003b\\u0027\\u00b7\\u47a3", 4344 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 4345 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 4346 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 4347 }; 4348 int loop; 4349 if (U_FAILURE(status)) { 4350 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4351 return; 4352 } 4353 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4354 // printf("looping %d\n", loop); 4355 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 4356 // RBBICharMonkey monkey; 4357 RBBIWordMonkey monkey; 4358 4359 int expected[50]; 4360 int expectedcount = 0; 4361 4362 monkey.setText(ustr); 4363 int i; 4364 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4365 expected[expectedcount ++] = i; 4366 } 4367 4368 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4369 } 4370 delete bi; 4371 #endif 4372 } 4373 4374 void RBBITest::TestWordBoundary(void) 4375 { 4376 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 4377 Locale locale("en"); 4378 UErrorCode status = U_ZERO_ERROR; 4379 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4380 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4381 UChar str[50]; 4382 static const char *strlist[] = 4383 { 4384 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4385 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4386 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4387 "\\u2027\\U000e0067\\u0a47\\u00b7", 4388 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4389 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4390 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4391 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4392 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4393 "\\u0027\\u11af\\U000e0057\\u0602", 4394 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 4395 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4396 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4397 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4398 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4399 "\\U000e0065\\u302c\\u09ee\\U000e0068", 4400 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4401 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4402 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4403 "\\u58f4\\U000e0049\\u20e7\\u2027", 4404 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4405 "\\ua183\\u102d\\u0bec\\u003a", 4406 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4407 "\\u003a\\u0e57\\u0fad\\u002e", 4408 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4409 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4410 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4411 "\\u003a\\u0664\\u00b7\\u1fba", 4412 "\\u003b\\u0027\\u00b7\\u47a3", 4413 }; 4414 int loop; 4415 if (U_FAILURE(status)) { 4416 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4417 return; 4418 } 4419 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4420 // printf("looping %d\n", loop); 4421 u_unescape(strlist[loop], str, 20); 4422 UnicodeString ustr(str); 4423 int forward[50]; 4424 int count = 0; 4425 4426 bi->setText(ustr); 4427 int prev = 0; 4428 int i; 4429 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4430 forward[count ++] = i; 4431 if (i > prev) { 4432 int j; 4433 for (j = prev + 1; j < i; j ++) { 4434 if (bi->isBoundary(j)) { 4435 printStringBreaks(ustr, forward, count); 4436 errln("happy boundary test failed: expected %d not a boundary", 4437 j); 4438 return; 4439 } 4440 } 4441 } 4442 if (!bi->isBoundary(i)) { 4443 printStringBreaks(ustr, forward, count); 4444 errln("happy boundary test failed: expected %d a boundary", 4445 i); 4446 return; 4447 } 4448 prev = i; 4449 } 4450 } 4451 delete 
bi; 4452 } 4453 4454 void RBBITest::TestLineBreaks(void) 4455 { 4456 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4457 Locale locale("en"); 4458 UErrorCode status = U_ZERO_ERROR; 4459 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4460 const int32_t STRSIZE = 50; 4461 UChar str[STRSIZE]; 4462 static const char *strlist[] = 4463 { 4464 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4465 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 4466 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4467 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4468 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4469 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 4470 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4471 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4472 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4473 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4474 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4475 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4476 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4477 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4478 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4479 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4480 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4481 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4482 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4483 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4484 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4485 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4486 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4487 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4488 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4489 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4490 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4491 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4492 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4493 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4494 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4495 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4496 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4497 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4498 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4499 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4500 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4501 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4502 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4503 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4504 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4505 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4506 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4507 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4508 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4509 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4510 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4511 }; 4512 int loop; 4513 TEST_ASSERT_SUCCESS(status); 4514 if (U_FAILURE(status)) { 4515 return; 4516 } 4517 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4518 // printf("looping %d\n", loop); 4519 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4520 if (t >= STRSIZE) { 4521 TEST_ASSERT(FALSE); 4522 continue; 4523 } 4524 4525 4526 UnicodeString ustr(str); 4527 RBBILineMonkey monkey; 4528 if (U_FAILURE(monkey.deferredStatus)) { 4529 continue; 4530 } 4531 4532 const int EXPECTEDSIZE = 50; 4533 int expected[EXPECTEDSIZE]; 4534 int expectedcount = 0; 4535 4536 monkey.setText(ustr); 4537 int i; 4538 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4539 if (expectedcount >= EXPECTEDSIZE) { 4540 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4541 return; 4542 } 4543 expected[expectedcount ++] = i; 4544 } 4545 4546 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4547 } 4548 delete bi; 4549 #endif 4550 } 4551 4552 void RBBITest::TestSentBreaks(void) 4553 { 4554 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4555 Locale locale("en"); 4556 UErrorCode status = U_ZERO_ERROR; 4557 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4558 UChar str[200]; 4559 static const char *strlist[] = 4560 { 4561 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4562 "This\n", 4563 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4564 "\"Sentence ending with a quote.\" Bye.", 4565 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4566 "Hi! 
This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4567 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4568 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4569 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4570 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 4571 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4572 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4573 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4574 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4575 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4576 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4577 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4578 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4579 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4580 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4581 }; 4582 int loop; 4583 if (U_FAILURE(status)) { 4584 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4585 return; 4586 } 4587 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4588 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4589 UnicodeString ustr(str); 4590 4591 RBBISentMonkey monkey; 4592 if (U_FAILURE(monkey.deferredStatus)) { 4593 continue; 4594 } 4595 4596 const int EXPECTEDSIZE = 50; 4597 int expected[EXPECTEDSIZE]; 4598 int expectedcount = 0; 4599 4600 monkey.setText(ustr); 4601 int i; 4602 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4603 if (expectedcount >= EXPECTEDSIZE) { 4604 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4605 return; 4606 
} 4607 expected[expectedcount ++] = i; 4608 } 4609 4610 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4611 } 4612 delete bi; 4613 #endif 4614 } 4615 4616 void RBBITest::TestMonkey(char *params) { 4617 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4618 4619 UErrorCode status = U_ZERO_ERROR; 4620 int32_t loopCount = 500; 4621 int32_t seed = 1; 4622 UnicodeString breakType = "all"; 4623 Locale locale("en"); 4624 UBool useUText = FALSE; 4625 4626 if (quick == FALSE) { 4627 loopCount = 10000; 4628 } 4629 4630 if (params) { 4631 UnicodeString p(params); 4632 loopCount = getIntParam("loop", p, loopCount); 4633 seed = getIntParam("seed", p, seed); 4634 4635 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4636 if (m.find()) { 4637 breakType = m.group(1, status); 4638 m.reset(); 4639 p = m.replaceFirst("", status); 4640 } 4641 4642 RegexMatcher u(" *utext", p, 0, status); 4643 if (u.find()) { 4644 useUText = TRUE; 4645 u.reset(); 4646 p = u.replaceFirst("", status); 4647 } 4648 4649 4650 // m.reset(p); 4651 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4652 // Each option is stripped out of the option string as it is processed. 4653 // All options have been checked. The option string should have been completely emptied.. 
4654 char buf[100]; 4655 p.extract(buf, sizeof(buf), NULL, status); 4656 buf[sizeof(buf)-1] = 0; 4657 errln("Unrecognized or extra parameter: %s\n", buf); 4658 return; 4659 } 4660 4661 } 4662 4663 if (breakType == "char" || breakType == "all") { 4664 RBBICharMonkey m; 4665 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4666 if (U_SUCCESS(status)) { 4667 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4668 if (breakType == "all" && useUText==FALSE) { 4669 // Also run a quick test with UText when "all" is specified 4670 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4671 } 4672 } 4673 else { 4674 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4675 } 4676 delete bi; 4677 } 4678 4679 if (breakType == "word" || breakType == "all") { 4680 logln("Word Break Monkey Test"); 4681 RBBIWordMonkey m; 4682 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4683 if (U_SUCCESS(status)) { 4684 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4685 } 4686 else { 4687 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4688 } 4689 delete bi; 4690 } 4691 4692 if (breakType == "line" || breakType == "all") { 4693 logln("Line Break Monkey Test"); 4694 RBBILineMonkey m; 4695 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4696 if (loopCount >= 10) { 4697 loopCount = loopCount / 5; // Line break runs slower than the others. 
4698 } 4699 if (U_SUCCESS(status)) { 4700 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4701 } 4702 else { 4703 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4704 } 4705 delete bi; 4706 } 4707 4708 if (breakType == "sent" || breakType == "all" ) { 4709 logln("Sentence Break Monkey Test"); 4710 RBBISentMonkey m; 4711 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4712 if (loopCount >= 10) { 4713 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4714 } 4715 if (U_SUCCESS(status)) { 4716 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4717 } 4718 else { 4719 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4720 } 4721 delete bi; 4722 } 4723 4724 #endif 4725 } 4726 4727 // 4728 // Run a RBBI monkey test. Common routine, for all break iterator types. 4729 // Parameters: 4730 // bi - the break iterator to use 4731 // mk - MonkeyKind, abstraction for obtaining expected results 4732 // name - Name of test (char, word, etc.) 
for use in error messages 4733 // seed - Seed for starting random number generator (parameter from user) 4734 // numIterations 4735 // 4736 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4737 int32_t numIterations, UBool useUText) { 4738 4739 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4740 4741 const int32_t TESTSTRINGLEN = 500; 4742 UnicodeString testText; 4743 int32_t numCharClasses; 4744 UVector *chClasses; 4745 int expected[TESTSTRINGLEN*2 + 1]; 4746 int expectedCount = 0; 4747 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4748 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4749 char reverseBreaks[TESTSTRINGLEN*2+1]; 4750 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4751 char followingBreaks[TESTSTRINGLEN*2+1]; 4752 char precedingBreaks[TESTSTRINGLEN*2+1]; 4753 int i; 4754 int loopCount = 0; 4755 4756 m_seed = seed; 4757 4758 numCharClasses = mk.charClasses()->size(); 4759 chClasses = mk.charClasses(); 4760 4761 // Check for errors that occured during the construction of the MonkeyKind object. 4762 // Can't report them where they occured because errln() is a method coming from intlTest, 4763 // and is not visible outside of RBBITest :-( 4764 if (U_FAILURE(mk.deferredStatus)) { 4765 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4766 return; 4767 } 4768 4769 // Verify that the character classes all have at least one member. 4770 for (i=0; i<numCharClasses; i++) { 4771 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4772 if (s == NULL || s->size() == 0) { 4773 errln("Character Class #%d is null or of zero size.", i); 4774 return; 4775 } 4776 } 4777 4778 while (loopCount < numIterations || numIterations == -1) { 4779 if (numIterations == -1 && loopCount % 10 == 0) { 4780 // If test is running in an infinite loop, display a periodic tic so 4781 // we can tell that it is making progress. 
4782 fprintf(stderr, "."); 4783 } 4784 // Save current random number seed, so that we can recreate the random numbers 4785 // for this loop iteration in event of an error. 4786 seed = m_seed; 4787 4788 // Populate a test string with data. 4789 testText.truncate(0); 4790 for (i=0; i<TESTSTRINGLEN; i++) { 4791 int32_t aClassNum = m_rand() % numCharClasses; 4792 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4793 int32_t charIdx = m_rand() % classSet->size(); 4794 UChar32 c = classSet->charAt(charIdx); 4795 if (c < 0) { // TODO: deal with sets containing strings. 4796 errln("c < 0"); 4797 break; 4798 } 4799 testText.append(c); 4800 } 4801 4802 // Calculate the expected results for this test string. 4803 mk.setText(testText); 4804 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4805 expectedBreaks[0] = 1; 4806 int32_t breakPos = 0; 4807 expectedCount = 0; 4808 for (;;) { 4809 breakPos = mk.next(breakPos); 4810 if (breakPos == -1) { 4811 break; 4812 } 4813 if (breakPos > testText.length()) { 4814 errln("breakPos > testText.length()"); 4815 } 4816 expectedBreaks[breakPos] = 1; 4817 U_ASSERT(expectedCount<testText.length()); 4818 expected[expectedCount ++] = breakPos; 4819 } 4820 4821 // Find the break positions using forward iteration 4822 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4823 if (useUText) { 4824 UErrorCode status = U_ZERO_ERROR; 4825 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4826 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4827 bi->setText(testUText, status); 4828 TEST_ASSERT_SUCCESS(status); 4829 utext_close(testUText); // The break iterator does a shallow clone of the UText 4830 // This UText can be closed immediately, so long as the 4831 // testText string continues to exist. 
4832 } else { 4833 bi->setText(testText); 4834 } 4835 4836 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4837 if (i < 0 || i > testText.length()) { 4838 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4839 break; 4840 } 4841 forwardBreaks[i] = 1; 4842 } 4843 4844 // Find the break positions using reverse iteration 4845 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4846 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4847 if (i < 0 || i > testText.length()) { 4848 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4849 break; 4850 } 4851 reverseBreaks[i] = 1; 4852 } 4853 4854 // Find the break positions using isBoundary() tests. 4855 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4856 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4857 for (i=0; i<=testText.length(); i++) { 4858 isBoundaryBreaks[i] = bi->isBoundary(i); 4859 } 4860 4861 4862 // Find the break positions using the following() function. 4863 // printf("."); 4864 memset(followingBreaks, 0, sizeof(followingBreaks)); 4865 int32_t lastBreakPos = 0; 4866 followingBreaks[0] = 1; 4867 for (i=0; i<testText.length(); i++) { 4868 breakPos = bi->following(i); 4869 if (breakPos <= i || 4870 breakPos < lastBreakPos || 4871 breakPos > testText.length() || 4872 breakPos > lastBreakPos && lastBreakPos > i ) { 4873 errln("%s break monkey test: " 4874 "Out of range value returned by BreakIterator::following().\n" 4875 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4876 name, seed, i, breakPos, lastBreakPos); 4877 break; 4878 } 4879 followingBreaks[breakPos] = 1; 4880 lastBreakPos = breakPos; 4881 } 4882 4883 // Find the break positions using the preceding() function. 
4884 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4885 lastBreakPos = testText.length(); 4886 precedingBreaks[testText.length()] = 1; 4887 for (i=testText.length(); i>0; i--) { 4888 breakPos = bi->preceding(i); 4889 if (breakPos >= i || 4890 breakPos > lastBreakPos || 4891 breakPos < 0 && testText.getChar32Start(i)>0 || 4892 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) { 4893 errln("%s break monkey test: " 4894 "Out of range value returned by BreakIterator::preceding().\n" 4895 "index=%d; prev returned %d; lastBreak=%d" , 4896 name, i, breakPos, lastBreakPos); 4897 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4898 precedingBreaks[i] = 2; // Forces an error. 4899 } 4900 } else { 4901 if (breakPos >= 0) { 4902 precedingBreaks[breakPos] = 1; 4903 } 4904 lastBreakPos = breakPos; 4905 } 4906 } 4907 4908 // Compare the expected and actual results. 4909 for (i=0; i<=testText.length(); i++) { 4910 const char *errorType = NULL; 4911 if (forwardBreaks[i] != expectedBreaks[i]) { 4912 errorType = "next()"; 4913 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4914 errorType = "previous()"; 4915 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4916 errorType = "isBoundary()"; 4917 } else if (followingBreaks[i] != expectedBreaks[i]) { 4918 errorType = "following()"; 4919 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4920 errorType = "preceding()"; 4921 } 4922 4923 4924 if (errorType != NULL) { 4925 // Format a range of the test text that includes the failure as 4926 // a data item that can be included in the rbbi test data file. 4927 4928 // Start of the range is the last point where expected and actual results 4929 // both agreed that there was a break position. 
4930 int startContext = i; 4931 int32_t count = 0; 4932 for (;;) { 4933 if (startContext==0) { break; } 4934 startContext --; 4935 if (expectedBreaks[startContext] != 0) { 4936 if (count == 2) break; 4937 count ++; 4938 } 4939 } 4940 4941 // End of range is two expected breaks past the start position. 4942 int endContext = i + 1; 4943 int ci; 4944 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4945 for (;;) { 4946 if (endContext >= testText.length()) {break;} 4947 if (expectedBreaks[endContext-1] != 0) { 4948 if (count == 0) break; 4949 count --; 4950 } 4951 endContext ++; 4952 } 4953 } 4954 4955 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4956 UnicodeString errorText = "<data>"; 4957 /***if (strcmp(errorType, "next()") == 0) { 4958 startContext = 0; 4959 endContext = testText.length(); 4960 4961 printStringBreaks(testText, expected, expectedCount); 4962 }***/ 4963 4964 for (ci=startContext; ci<endContext;) { 4965 UnicodeString hexChars("0123456789abcdef"); 4966 UChar32 c; 4967 int bn; 4968 c = testText.char32At(ci); 4969 if (ci == i) { 4970 // This is the location of the error. 4971 errorText.append("<?>"); 4972 } else if (expectedBreaks[ci] != 0) { 4973 // This a non-error expected break position. 4974 errorText.append("\\"); 4975 } 4976 if (c < 0x10000) { 4977 errorText.append("\\u"); 4978 for (bn=12; bn>=0; bn-=4) { 4979 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4980 } 4981 } else { 4982 errorText.append("\\U"); 4983 for (bn=28; bn>=0; bn-=4) { 4984 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4985 } 4986 } 4987 ci = testText.moveIndex32(ci, 1); 4988 } 4989 errorText.append("\\"); 4990 errorText.append("</data>\n"); 4991 4992 // Output the error 4993 char charErrorTxt[500]; 4994 UErrorCode status = U_ZERO_ERROR; 4995 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4996 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4997 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4998 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4999 errorType, seed, i, charErrorTxt); 5000 break; 5001 } 5002 } 5003 5004 loopCount++; 5005 } 5006 #endif 5007 } 5008 5009 // 5010 // TestDebug - A place-holder test for debugging purposes. 5011 // For putting in fragments of other tests that can be invoked 5012 // for tracing without a lot of unwanted extra stuff happening. 5013 // 5014 void RBBITest::TestDebug(void) { 5015 #if 0 5016 UErrorCode status = U_ZERO_ERROR; 5017 int pos = 0; 5018 int ruleStatus = 0; 5019 5020 RuleBasedBreakIterator* bi = 5021 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 5022 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 5023 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 5024 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 5025 // UnicodeString s("Aaa. Bcd"); 5026 s = s.unescape(); 5027 bi->setText(s); 5028 UBool r = bi->isBoundary(8); 5029 printf("%s", r?"true":"false"); 5030 return; 5031 pos = bi->last(); 5032 do { 5033 // ruleStatus = bi->getRuleStatus(); 5034 printf("%d\t%d\n", pos, ruleStatus); 5035 pos = bi->previous(); 5036 } while (pos != BreakIterator::DONE); 5037 #endif 5038 } 5039 5040 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 5041