/* GENERATED SOURCE. DO NOT MODIFY. */
// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 * Created on May 5, 2004
 *
 * Copyright (C) 2004-2016 International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 */
package android.icu.dev.test.rbbi;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import android.icu.dev.test.TestFmwk;
import android.icu.impl.Utility;
import android.icu.lang.UCharacter;
import android.icu.text.BreakIterator;
import android.icu.text.RuleBasedBreakIterator;
import android.icu.util.ULocale;
import android.icu.testsharding.MainTestShard;


/**
 * Rule based break iterator data driven test.
 *      Perform the tests from the file rbbitst.txt.
 *      The test data file is common to both ICU4C and ICU4J.
 *      See the data file for a description of the tests.
 *
 */
@MainTestShard
@RunWith(JUnit4.class)
public class RBBITestExtended extends TestFmwk {

    // Parser states used while scanning the test data file.
    private static final int PARSE_COMMENT = 1;   // inside a '#' comment, skipping to end of line
    private static final int PARSE_TAG     = 2;   // between tests, expecting a <tag>
    private static final int PARSE_DATA    = 3;   // inside a <data> ... </data> block
    private static final int PARSE_NUM     = 4;   // inside a numeric <nnn> tag within a data block
    private static final int PARSE_RULES   = 5;   // inside a <rules> or <badrules> block

    // Character that marks an expected break position inside a <data> block.
    // NOTE(review): restored as U+2022 (BULLET); the literal was lost from the source text
    //               and should be confirmed against the rbbitst.txt format description.
    private static final int BREAK_MARKER = 0x2022;

    public RBBITestExtended() {
    }


    /**
     * State of the test currently being assembled from the data file.
     * The three int arrays are indexed by offset into {@link #dataToBreak}.
     */
    static class TestParams {
        BreakIterator   bi;
        StringBuilder   dataToBreak    = new StringBuilder();
        int[]           expectedBreaks = new int[4000];   // 0: no break, -1: break with status 0, else status
        int[]           srcLine        = new int[4000];   // source file line for each data offset
        int[]           srcCol         = new int[4000];   // source file column for each data offset
        ULocale         currentLocale  = new ULocale("en_US");

        /**
         * Records an expected break, with the given tag value, at the current end of the data,
         * together with the source file position it came from.
         */
        void expectBreak(int tagValue, int line, int col) {
            int breakIdx = dataToBreak.length();
            expectedBreaks[breakIdx] = tagValue;
            srcLine[breakIdx] = line;
            srcCol[breakIdx] = col;
        }

        /**
         * Saves the mapping from offsets in the data to line/column numbers in the input file,
         * for the data that was just appended. Used for better error messages only.
         * Walks backwards from the last data offset and stops at the first slot that is
         * already filled, so a position recorded for an expected break is not overwritten.
         */
        void recordSourcePosition(int line, int col) {
            for (int i = dataToBreak.length() - 1; i >= 0 && srcLine[i] == 0; i--) {
                srcLine[i] = line;
                srcCol[i] = col;
            }
        }
    }


    /**
     * Reads the whole of rbbitst.txt (UTF-8) into a string, discarding a leading BOM.
     *
     * @return the file contents, or null if the file could not be opened or read
     *         (an error has already been reported in that case).
     */
    private String readTestData() {
        InputStream is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
        if (is == null) {
            errln("Could not open test data file rbbitst.txt");
            return null;
        }

        StringBuilder testFileBuf = new StringBuilder();
        try {
            try {
                InputStreamReader isr = new InputStreamReader(is, "UTF-8");
                boolean atStart = true;
                for (int c = isr.read(); c >= 0; c = isr.read()) {
                    if (atStart) {
                        atStart = false;
                        if (c == 0xFEFF) {
                            // BOM in the test data file. Discard it.
                            continue;
                        }
                    }
                    // read() delivers UTF-16 code units; append them unchanged.
                    testFileBuf.append((char) c);
                }
            } finally {
                is.close();
            }
        } catch (IOException e) {
            errln(e.toString());
            return null;
        }
        return testFileBuf.toString();
    }


    /**
     * Parses rbbitst.txt and runs every test it describes.
     * Tags select or build the break iterator; each &lt;data&gt; block supplies text with its
     * expected break positions and is executed as soon as its closing tag is seen.
     */
    @Test
    public void TestExtended() {
        TestParams tp = new TestParams();

        //
        //  Open and read the test data file.
        //
        String testString = readTestData();
        if (testString == null) {
            return;     // The failure has already been reported.
        }

        int parseState = PARSE_TAG;
        int savedState = PARSE_TAG;     // State to resume once a comment ends.

        int lineNum  = 1;
        int colStart = 0;
        int column   = 0;
        int charIdx  = 0;

        int tagValue = 0;               // The numeric value of a <nnn> tag.

        StringBuilder rules = new StringBuilder();  // Holds rules from a <rules> ... </rules> block
        int rulesFirstLine = 0;                     // Line number of the start of current <rules> block

        int len = testString.length();

        while (charIdx < len) {
            int c = testString.codePointAt(charIdx);
            // Step over the whole code point, so that a supplementary character is not
            // followed by a second visit to its trailing surrogate.
            // All of the startsWith(..., charIdx-1) tests below look for ASCII text, for which
            // charIdx-1 is the index of the character just read.
            charIdx += Character.charCount(c);
            if (c == '\r' && charIdx < len && testString.charAt(charIdx) == '\n') {
                // treat CRLF as a unit
                c = '\n';
                charIdx++;
            }
            if (c == '\n' || c == '\r') {
                lineNum++;
                colStart = charIdx;
            }
            column = charIdx - colStart + 1;

            switch (parseState) {
            case PARSE_COMMENT:
                if (c == 0x0a || c == 0x0d) {
                    parseState = savedState;
                }
                break;

            case PARSE_TAG:
                {
                    if (c == '#') {
                        parseState = PARSE_COMMENT;
                        savedState = PARSE_TAG;
                        break;
                    }
                    if (UCharacter.isWhitespace(c)) {
                        break;
                    }
                    if (testString.startsWith("<word>", charIdx-1)) {
                        tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<char>", charIdx-1)) {
                        tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<line>", charIdx-1)) {
                        tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<sent>", charIdx-1)) {
                        tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<title>", charIdx-1)) {
                        tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
                        charIdx += 6;
                        break;
                    }
                    if (testString.startsWith("<rules>", charIdx-1) ||
                            testString.startsWith("<badrules>", charIdx-1)) {
                        // Skip to just past the '>' that closes the opening tag.
                        charIdx = testString.indexOf('>', charIdx) + 1;
                        parseState = PARSE_RULES;
                        rules.setLength(0);
                        rulesFirstLine = lineNum;
                        break;
                    }

                    if (testString.startsWith("<locale ", charIdx-1)) {
                        int closeIndex = testString.indexOf(">", charIdx);
                        if (closeIndex < 0) {
                            errln("line" + lineNum + ": missing close on <locale tag.");
                            break;
                        }
                        String localeName = testString.substring(charIdx+6, closeIndex);
                        localeName = localeName.trim();
                        tp.currentLocale = new ULocale(localeName);
                        charIdx = closeIndex+1;
                        break;
                    }
                    if (testString.startsWith("<data>", charIdx-1)) {
                        parseState = PARSE_DATA;
                        charIdx += 5;
                        tp.dataToBreak.setLength(0);
                        Arrays.fill(tp.expectedBreaks, 0);
                        Arrays.fill(tp.srcCol, 0);
                        Arrays.fill(tp.srcLine, 0);
                        break;
                    }

                    errln("line" + lineNum + ": Tag expected in test file.");
                    return;
                }

            case PARSE_RULES:
                if (testString.startsWith("</rules>", charIdx-1)) {
                    charIdx += 7;
                    parseState = PARSE_TAG;
                    try {
                        tp.bi = new RuleBasedBreakIterator(rules.toString());
                    } catch (IllegalArgumentException e) {
                        errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
                    }
                } else if (testString.startsWith("</badrules>", charIdx-1)) {
                    charIdx += 10;
                    parseState = PARSE_TAG;
                    // These rules are expected to be rejected; the iterator itself is not kept.
                    boolean goodRules = true;
                    try {
                        new RuleBasedBreakIterator(rules.toString());
                    } catch (IllegalArgumentException e) {
                        goodRules = false;
                    }
                    if (goodRules) {
                        errln(String.format(
                                "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
                                lineNum));
                    }
                } else {
                    rules.appendCodePoint(c);
                }
                break;

            case PARSE_DATA:
                if (c == BREAK_MARKER) {
                    tp.expectBreak(-1, lineNum, column);
                    break;
                }

                if (testString.startsWith("</data>", charIdx-1)) {
                    // Add final entry to mappings from break location to source file position.
                    //  Need one extra because last break position returned is after the
                    //    last char in the data, not at the last char.
                    int idx = tp.dataToBreak.length();
                    tp.srcLine[idx] = lineNum;
                    tp.srcCol[idx] = column;

                    parseState = PARSE_TAG;
                    charIdx += 6;

                    // RUN THE TEST!
                    executeTest(tp);
                    break;
                }

                if (testString.startsWith("\\N{", charIdx-1)) {
                    // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                    // Get the code point from the name and insert it into the test data.
                    int nameEndIdx = testString.indexOf('}', charIdx);
                    if (nameEndIdx == -1) {
                        // No closing brace, so there is no name to extract.
                        // Report it and carry on with the following characters.
                        errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                        break;
                    }
                    String charName = testString.substring(charIdx+2, nameEndIdx);
                    c = UCharacter.getCharFromName(charName);
                    if (c == -1) {
                        errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                    } else {
                        // Named code point was recognized.  Insert it
                        //   into the test data.
                        tp.dataToBreak.appendCodePoint(c);
                        tp.recordSourcePosition(lineNum, column);
                    }
                    charIdx = nameEndIdx+1;
                    break;
                }

                if (testString.startsWith("<>", charIdx-1)) {
                    charIdx++;
                    tp.expectBreak(-1, lineNum, column);
                    break;
                }

                if (c == '<') {
                    tagValue = 0;
                    parseState = PARSE_NUM;
                    break;
                }

                if (c == '#' && column==3) {   // TODO:  why is column off so far?
                    parseState = PARSE_COMMENT;
                    savedState = PARSE_DATA;
                    break;
                }

                if (c == '\\') {
                    if (charIdx >= len) {
                        // Nothing follows the backslash. The unterminated <data> block
                        // is also reported after the loop.
                        errln(String.format("rbbitst.txt:%d  Backslash at end of test file.", lineNum));
                        break;
                    }

                    // Check for \ at end of line, a line continuation.
                    //     Advance over (discard) the newline
                    int cp = testString.codePointAt(charIdx);
                    if (cp == '\r' && charIdx+1 < len && testString.charAt(charIdx+1) == '\n') {
                        // We have a CR LF
                        //  Need an extra increment of the input ptr to move over both of them
                        charIdx++;
                    }
                    if (cp == '\n' || cp == '\r') {
                        lineNum++;
                        column = 0;
                        charIdx++;
                        colStart = charIdx;
                        break;
                    }

                    // Let unescape handle the back slash.
                    int[] charIdxAr = { charIdx };
                    cp = Utility.unescapeAt(testString, charIdxAr);
                    if (cp != -1) {
                        // Escape sequence was recognized.  Insert the char
                        //   into the test data.
                        charIdx = charIdxAr[0];
                        tp.dataToBreak.appendCodePoint(cp);
                        tp.recordSourcePosition(lineNum, column);
                        break;
                    }

                    // Not a recognized backslash escape sequence.
                    // Take the next char as a literal.
                    //  TODO:  Should this be an error?
                    c = testString.codePointAt(charIdx);
                    charIdx = testString.offsetByCodePoints(charIdx, 1);
                }

                // Normal, non-escaped data char.
                tp.dataToBreak.appendCodePoint(c);

                // If there's an expected break before this char, the slot in the mapping
                // will already be set for this char; it is not overwritten.
                tp.recordSourcePosition(lineNum, column);
                break;

            case PARSE_NUM:
                // We are parsing an expected numeric tag value, like <1234>,
                //   within a chunk of data.
                if (UCharacter.isWhitespace(c)) {
                    break;
                }

                if (c == '>') {
                    // Finished the number.  Add the info to the expected break data,
                    //   and switch parse state back to doing plain data.
                    parseState = PARSE_DATA;
                    if (tagValue == 0) {
                        // Zero means "no break" in expectedBreaks, so a status of 0 is stored as -1.
                        tagValue = -1;
                    }
                    tp.expectBreak(tagValue, lineNum, column);
                    break;
                }

                if (UCharacter.isDigit(c)) {
                    tagValue = tagValue*10 + UCharacter.digit(c);
                    break;
                }

                errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
                return;
            }
        }

        // Reached end of test file. Raise an error if parseState indicates that we are
        //   within a block that should have been terminated.
        if (parseState == PARSE_RULES) {
            errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
                    lineNum, rulesFirstLine));
        }
        if (parseState == PARSE_DATA) {
            errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
        }
    }

    /**
     * Runs one test: sets the collected data on the break iterator and checks forward
     * iteration, reverse iteration, isBoundary(), following() and preceding() against the
     * expected break positions. Does nothing if no break iterator is currently set.
     *
     * @param t the iterator, the text to break, and the expected results.
     */
    void executeTest(TestParams t) {
        // TODO: also rerun tests with a break iterator re-created from bi.getRules()
        //       and from bi.clone(). If in exhaustive mode only.
        int bp;
        int prevBP;
        int i;

        if (t.bi == null) {
            return;
        }

        final int dataLen = t.dataToBreak.length();
        t.bi.setText(t.dataToBreak.toString());

        //
        //  Run the iterator forward
        //
        prevBP = -1;
        for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
            if (prevBP == bp) {
                // Fail for lack of forward progress.
                errln("Forward Iteration, no forward progress. Break Pos=" + bp +
                        " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
                break;
            }

            // Check that we didn't miss an expected break between the last one
            //  and this one.
            for (i=prevBP+1; i<bp; i++) {
                if (t.expectedBreaks[i] != 0) {
                    errln("Forward Iteration, break expected, but not found. Pos=" + i +
                            " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
                }
            }

            // Check that the break we did find was expected
            if (t.expectedBreaks[bp] == 0) {
                errln("Forward Iteration, break found, but not expected. Pos=" + bp +
                        " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
            } else {
                // The break was expected.
                //   Check that the {nnn} tag value is correct.
                int expectedTagVal = t.expectedBreaks[bp];
                if (expectedTagVal == -1) {
                    expectedTagVal = 0;
                }
                int line = t.srcLine[bp];
                int rs = t.bi.getRuleStatus();
                if (rs != expectedTagVal) {
                    errln("Incorrect status for forward break. Pos = " + bp +
                            ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
                            " Actual, Expected status = " + rs + ", " + expectedTagVal);
                }
                int[] fillInArray = new int[4];
                int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
                assertTrue("", numStatusVals >= 1);
                assertEquals("", expectedTagVal, fillInArray[0]);
            }

            prevBP = bp;
        }

        // Verify that there were no missed expected breaks after the last one found
        for (i=prevBP+1; i<dataLen+1; i++) {
            if (t.expectedBreaks[i] != 0) {
                errln("Forward Iteration, break expected, but not found. Pos=" + i +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
            }
        }

        //
        //  Run the iterator backwards, verify that the same breaks are found.
        //
        prevBP = dataLen+2;  // start with a phony value for the last break pos seen.
        for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
            if (prevBP == bp) {
                // Fail for lack of progress.
                errln("Reverse Iteration, no progress. Break Pos=" + bp +
                        "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
                break;
            }

            // Check that we didn't miss an expected break between the last one
            //  and this one. (Slots past the end of the data are zero, so the
            //  phony starting value is harmless.)
            for (i=prevBP-1; i>bp; i--) {
                if (t.expectedBreaks[i] != 0) {
                    errln("Reverse Itertion, break expected, but not found. Pos=" + i +
                            " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
                }
            }

            // Check that the break we did find was expected
            if (t.expectedBreaks[bp] == 0) {
                errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
                        " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
            } else {
                // The break was expected.
                //   Check that the {nnn} tag value is correct.
                int expectedTagVal = t.expectedBreaks[bp];
                if (expectedTagVal == -1) {
                    expectedTagVal = 0;
                }
                int line = t.srcLine[bp];
                int rs = t.bi.getRuleStatus();
                if (rs != expectedTagVal) {
                    errln("Incorrect status for reverse break. Pos = " + bp +
                            " File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
                            " Actual, Expected status = " + rs + ", " + expectedTagVal);
                }
            }

            prevBP = bp;
        }

        // Verify that there were no missed breaks prior to the last one found
        for (i=prevBP-1; i>=0; i--) {
            if (t.expectedBreaks[i] != 0) {
                errln("Reverse Itertion, break expected, but not found. Pos=" + i +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
            }
        }

        // Check isBoundary()
        for (i=0; i<=dataLen; i++) {
            boolean boundaryExpected = (t.expectedBreaks[i] != 0);
            boolean boundaryFound = t.bi.isBoundary(i);
            if (boundaryExpected != boundaryFound) {
                errln("isBoundary(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
            }
        }

        // Check following()
        for (i=0; i<=dataLen; i++) {
            int actualBreak = t.bi.following(i);
            int expectedBreak = BreakIterator.DONE;
            // Expected breaks are only ever recorded at offsets 0..dataLen, so there is
            // no need to scan the rest of the array.
            for (int j=i+1; j <= dataLen; j++) {
                if (t.expectedBreaks[j] != 0) {
                    expectedBreak = j;
                    break;
                }
            }
            if (expectedBreak != actualBreak) {
                errln("following(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + expectedBreak + ", " + actualBreak);
            }
        }

        // Check preceding()
        for (i=dataLen; i>=0; i--) {
            int actualBreak = t.bi.preceding(i);
            int expectedBreak = BreakIterator.DONE;

            for (int j=i-1; j >= 0; j--) {
                if (t.expectedBreaks[j] != 0) {
                    expectedBreak = j;
                    break;
                }
            }
            if (expectedBreak != actualBreak) {
                errln("preceding(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + expectedBreak + ", " + actualBreak);
            }
        }

    }

}