1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 * Created on May 5, 2004 5 * 6 * Copyright (C) 2004-2016 International Business Machines Corporation and others. 7 * All Rights Reserved. 8 * 9 */ 10 package com.ibm.icu.dev.test.rbbi; 11 12 import java.io.IOException; 13 import java.io.InputStream; 14 import java.io.InputStreamReader; 15 import java.util.Arrays; 16 17 import org.junit.Test; 18 import org.junit.runner.RunWith; 19 import org.junit.runners.JUnit4; 20 21 import com.ibm.icu.dev.test.TestFmwk; 22 import com.ibm.icu.impl.Utility; 23 import com.ibm.icu.lang.UCharacter; 24 import com.ibm.icu.text.BreakIterator; 25 import com.ibm.icu.text.RuleBasedBreakIterator; 26 import com.ibm.icu.util.ULocale; 27 28 29 /** 30 * Rule based break iterator data driven test. 31 * Perform the tests from the file rbbitst.txt. 32 * The test data file is common to both ICU4C and ICU4J. 33 * See the data file for a description of the tests. 34 * 35 */ 36 @RunWith(JUnit4.class) 37 public class RBBITestExtended extends TestFmwk { 38 public RBBITestExtended() { 39 } 40 41 42 43 static class TestParams { 44 BreakIterator bi; 45 StringBuilder dataToBreak = new StringBuilder(); 46 int[] expectedBreaks = new int[4000]; 47 int[] srcLine = new int[4000]; 48 int[] srcCol = new int[4000]; 49 ULocale currentLocale = new ULocale("en_US"); 50 } 51 52 53 @Test 54 public void TestExtended() { 55 TestParams tp = new TestParams(); 56 57 58 // 59 // Open and read the test data file. 60 // 61 StringBuilder testFileBuf = new StringBuilder(); 62 InputStream is = null; 63 try { 64 is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt"); 65 if (is == null) { 66 errln("Could not open test data file rbbitst.txt"); 67 return; 68 } 69 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 70 try { 71 int c; 72 int count = 0; 73 for (;;) { 74 c = isr.read(); 75 if (c < 0) { 76 break; 77 } 78 count++; 79 if (c == 0xFEFF && count == 1) { 80 // BOM in the test data file. Discard it. 81 continue; 82 } 83 84 testFileBuf.appendCodePoint(c); 85 } 86 } finally { 87 isr.close(); 88 } 89 } catch (IOException e) { 90 errln(e.toString()); 91 try { 92 is.close(); 93 } catch (IOException ignored) { 94 } 95 return; 96 } 97 98 String testString = testFileBuf.toString(); 99 100 101 final int PARSE_COMMENT = 1; 102 final int PARSE_TAG = 2; 103 final int PARSE_DATA = 3; 104 final int PARSE_NUM = 4; 105 final int PARSE_RULES = 5; 106 107 int parseState = PARSE_TAG; 108 109 int savedState = PARSE_TAG; 110 111 int lineNum = 1; 112 int colStart = 0; 113 int column = 0; 114 int charIdx = 0; 115 int i; 116 117 int tagValue = 0; // The numeric value of a <nnn> tag. 118 119 StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block 120 int rulesFirstLine = 0; // Line number of the start of current <rules> block 121 122 int len = testString.length(); 123 124 for (charIdx = 0; charIdx < len; ) { 125 int c = testString.codePointAt(charIdx); 126 charIdx++; 127 if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') { 128 // treat CRLF as a unit 129 c = '\n'; 130 charIdx++; 131 } 132 if (c == '\n' || c == '\r') { 133 lineNum++; 134 colStart = charIdx; 135 } 136 column = charIdx - colStart + 1; 137 138 switch (parseState) { 139 case PARSE_COMMENT: 140 if (c == 0x0a || c == 0x0d) { 141 parseState = savedState; 142 } 143 break; 144 145 case PARSE_TAG: 146 { 147 if (c == '#') { 148 parseState = PARSE_COMMENT; 149 savedState = PARSE_TAG; 150 break; 151 } 152 if (UCharacter.isWhitespace(c)) { 153 break; 154 } 155 if (testString.startsWith("<word>", charIdx-1)) { 156 tp.bi = BreakIterator.getWordInstance(tp.currentLocale); 157 charIdx += 5; 158 break; 159 } 160 if (testString.startsWith("<char>", charIdx-1)) { 161 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale); 162 charIdx += 5; 163 break; 164 } 165 if (testString.startsWith("<line>", charIdx-1)) { 166 tp.bi = BreakIterator.getLineInstance(tp.currentLocale); 167 charIdx += 5; 168 break; 169 } 170 if (testString.startsWith("<sent>", charIdx-1)) { 171 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale); 172 charIdx += 5; 173 break; 174 } 175 if (testString.startsWith("<title>", charIdx-1)) { 176 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale); 177 charIdx += 6; 178 break; 179 } 180 if (testString.startsWith("<rules>", charIdx-1) || 181 testString.startsWith("<badrules>", charIdx-1)) { 182 charIdx = testString.indexOf('>', charIdx) + 1; 183 parseState = PARSE_RULES; 184 rules.setLength(0); 185 rulesFirstLine = lineNum; 186 break; 187 } 188 189 if (testString.startsWith("<locale ", charIdx-1)) { 190 int closeIndex = testString.indexOf(">", charIdx); 191 if (closeIndex < 0) { 192 errln("line" + lineNum + ": missing close on <locale tag."); 193 break; 194 } 195 String localeName = testString.substring(charIdx+6, closeIndex); 196 localeName = localeName.trim(); 197 tp.currentLocale = new ULocale(localeName); 198 charIdx = closeIndex+1; 199 break; 200 } 201 if (testString.startsWith("<data>", charIdx-1)) { 202 parseState = PARSE_DATA; 203 charIdx += 5; 204 tp.dataToBreak.setLength(0); 205 Arrays.fill(tp.expectedBreaks, 0); 206 Arrays.fill(tp.srcCol, 0); 207 Arrays.fill(tp.srcLine, 0); 208 break; 209 } 210 211 errln("line" + lineNum + ": Tag expected in test file."); 212 return; 213 //parseState = PARSE_COMMENT; 214 //savedState = PARSE_DATA; 215 } 216 217 case PARSE_RULES: 218 if (testString.startsWith("</rules>", charIdx-1)) { 219 charIdx += 7; 220 parseState = PARSE_TAG; 221 try { 222 tp.bi = new RuleBasedBreakIterator(rules.toString()); 223 } catch (IllegalArgumentException e) { 224 errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e)); 225 } 226 } else if (testString.startsWith("</badrules>", charIdx-1)) { 227 charIdx += 10; 228 parseState = PARSE_TAG; 229 boolean goodRules = true; 230 try { 231 new RuleBasedBreakIterator(rules.toString()); 232 } catch (IllegalArgumentException e) { 233 goodRules = false; 234 } 235 if (goodRules) { 236 errln(String.format( 237 "rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.", 238 lineNum)); 239 } 240 } else { 241 rules.appendCodePoint(c); 242 } 243 break; 244 245 case PARSE_DATA: 246 if (c == '') { 247 int breakIdx = tp.dataToBreak.length(); 248 tp.expectedBreaks[breakIdx] = -1; 249 tp.srcLine[breakIdx] = lineNum; 250 tp.srcCol[breakIdx] = column; 251 break; 252 } 253 254 if (testString.startsWith("</data>", charIdx-1)) { 255 // Add final entry to mappings from break location to source file position. 256 // Need one extra because last break position returned is after the 257 // last char in the data, not at the last char. 258 int idx = tp.dataToBreak.length(); 259 tp.srcLine[idx] = lineNum; 260 tp.srcCol[idx] = column; 261 262 parseState = PARSE_TAG; 263 charIdx += 6; 264 265 // RUN THE TEST! 266 executeTest(tp); 267 break; 268 } 269 270 if (testString.startsWith("\\N{", charIdx-1)) { 271 int nameEndIdx = testString.indexOf('}', charIdx); 272 if (nameEndIdx == -1) { 273 errln("Error in named character in test file at line " + lineNum + 274 ", col " + column); 275 } 276 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 277 // Get the code point from the name and insert it into the test data. 278 String charName = testString.substring(charIdx+2, nameEndIdx); 279 c = UCharacter.getCharFromName(charName); 280 if (c == -1) { 281 errln("Error in named character in test file at line " + lineNum + 282 ", col " + column); 283 } else { 284 // Named code point was recognized. Insert it 285 // into the test data. 286 tp.dataToBreak.appendCodePoint(c); 287 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { 288 tp.srcLine[i] = lineNum; 289 tp.srcCol[i] = column; 290 } 291 292 } 293 if (nameEndIdx > charIdx) { 294 charIdx = nameEndIdx+1; 295 } 296 break; 297 } 298 299 if (testString.startsWith("<>", charIdx-1)) { 300 charIdx++; 301 int breakIdx = tp.dataToBreak.length(); 302 tp.expectedBreaks[breakIdx] = -1; 303 tp.srcLine[breakIdx] = lineNum; 304 tp.srcCol[breakIdx] = column; 305 break; 306 } 307 308 if (c == '<') { 309 tagValue = 0; 310 parseState = PARSE_NUM; 311 break; 312 } 313 314 if (c == '#' && column==3) { // TODO: why is column off so far? 315 parseState = PARSE_COMMENT; 316 savedState = PARSE_DATA; 317 break; 318 } 319 320 if (c == '\\') { 321 // Check for \ at end of line, a line continuation. 322 // Advance over (discard) the newline 323 int cp = testString.codePointAt(charIdx); 324 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') { 325 // We have a CR LF 326 // Need an extra increment of the input ptr to move over both of them 327 charIdx++; 328 } 329 if (cp == '\n' || cp == '\r') { 330 lineNum++; 331 column = 0; 332 charIdx++; 333 colStart = charIdx; 334 break; 335 } 336 337 // Let unescape handle the back slash. 338 int charIdxAr[] = new int[1]; 339 charIdxAr[0] = charIdx; 340 cp = Utility.unescapeAt(testString, charIdxAr); 341 if (cp != -1) { 342 // Escape sequence was recognized. Insert the char 343 // into the test data. 344 charIdx = charIdxAr[0]; 345 tp.dataToBreak.appendCodePoint(cp); 346 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { 347 tp.srcLine[i] = lineNum; 348 tp.srcCol[i] = column; 349 } 350 351 break; 352 } 353 354 355 // Not a recognized backslash escape sequence. 356 // Take the next char as a literal. 357 // TODO: Should this be an error? 358 c = testString.codePointAt(charIdx); 359 charIdx = testString.offsetByCodePoints(charIdx, 1); 360 } 361 362 // Normal, non-escaped data char. 363 tp.dataToBreak.appendCodePoint(c); 364 365 // Save the mapping from offset in the data to line/column numbers in 366 // the original input file. Will be used for better error messages only. 367 // If there's an expected break before this char, the slot in the mapping 368 // vector will already be set for this char; don't overwrite it. 369 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { 370 tp.srcLine[i] = lineNum; 371 tp.srcCol[i] = column; 372 } 373 break; 374 375 376 case PARSE_NUM: 377 // We are parsing an expected numeric tag value, like <1234>, 378 // within a chunk of data. 379 if (UCharacter.isWhitespace(c)) { 380 break; 381 } 382 383 if (c == '>') { 384 // Finished the number. Add the info to the expected break data, 385 // and switch parse state back to doing plain data. 386 parseState = PARSE_DATA; 387 if (tagValue == 0) { 388 tagValue = -1; 389 } 390 int breakIdx = tp.dataToBreak.length(); 391 tp.expectedBreaks[breakIdx] = tagValue; 392 tp.srcLine[breakIdx] = lineNum; 393 tp.srcCol[breakIdx] = column; 394 break; 395 } 396 397 if (UCharacter.isDigit(c)) { 398 tagValue = tagValue*10 + UCharacter.digit(c); 399 break; 400 } 401 402 errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column)); 403 return; 404 } 405 } 406 407 // Reached end of test file. Raise an error if parseState indicates that we are 408 // within a block that should have been terminated. 409 if (parseState == PARSE_RULES) { 410 errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.", 411 lineNum, rulesFirstLine)); 412 } 413 if (parseState == PARSE_DATA) { 414 errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum)); 415 } 416 } 417 418 void executeTest(TestParams t) { 419 // TODO: also rerun tests with a break iterator re-created from bi.getRules() 420 // and from bi.clone(). If in exhaustive mode only. 421 int bp; 422 int prevBP; 423 int i; 424 425 if (t.bi == null) { 426 return; 427 } 428 429 t.bi.setText(t.dataToBreak.toString()); 430 // 431 // Run the iterator forward 432 // 433 prevBP = -1; 434 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) { 435 if (prevBP == bp) { 436 // Fail for lack of forward progress. 437 errln("Forward Iteration, no forward progress. Break Pos=" + bp + 438 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]); 439 break; 440 } 441 442 // Check that there were we didn't miss an expected break between the last one 443 // and this one. 444 for (i=prevBP+1; i<bp; i++) { 445 if (t.expectedBreaks[i] != 0) { 446 errln("Forward Iteration, break expected, but not found. Pos=" + i + 447 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); 448 } 449 } 450 451 // Check that the break we did find was expected 452 if (t.expectedBreaks[bp] == 0) { 453 errln("Forward Iteration, break found, but not expected. Pos=" + bp + 454 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); 455 } else { 456 // The break was expected. 457 // Check that the {nnn} tag value is correct. 458 int expectedTagVal = t.expectedBreaks[bp]; 459 if (expectedTagVal == -1) { 460 expectedTagVal = 0; 461 } 462 int line = t.srcLine[bp]; 463 int rs = t.bi.getRuleStatus(); 464 if (rs != expectedTagVal) { 465 errln("Incorrect status for forward break. Pos = " + bp + 466 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" + 467 " Actual, Expected status = " + rs + ", " + expectedTagVal); 468 } 469 int[] fillInArray = new int[4]; 470 int numStatusVals = t.bi.getRuleStatusVec(fillInArray); 471 assertTrue("", numStatusVals >= 1); 472 assertEquals("", expectedTagVal, fillInArray[0]); 473 } 474 475 476 prevBP = bp; 477 } 478 479 // Verify that there were no missed expected breaks after the last one found 480 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) { 481 if (t.expectedBreaks[i] != 0) { 482 errln("Forward Iteration, break expected, but not found. Pos=" + i + 483 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); 484 } 485 } 486 487 488 // 489 // Run the iterator backwards, verify that the same breaks are found. 490 // 491 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen. 492 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) { 493 if (prevBP == bp) { 494 // Fail for lack of progress. 495 errln("Reverse Iteration, no progress. Break Pos=" + bp + 496 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]); 497 break; 498 } 499 500 // Check that we didn't miss an expected break between the last one 501 // and this one. (UVector returns zeros for index out of bounds.) 502 for (i=prevBP-1; i>bp; i--) { 503 if (t.expectedBreaks[i] != 0) { 504 errln("Reverse Itertion, break expected, but not found. Pos=" + i + 505 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); 506 } 507 } 508 509 // Check that the break we did find was expected 510 if (t.expectedBreaks[bp] == 0) { 511 errln("Reverse Itertion, break found, but not expected. Pos=" + bp + 512 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); 513 } else { 514 // The break was expected. 515 // Check that the {nnn} tag value is correct. 516 int expectedTagVal = t.expectedBreaks[bp]; 517 if (expectedTagVal == -1) { 518 expectedTagVal = 0; 519 } 520 int line = t.srcLine[bp]; 521 int rs = t.bi.getRuleStatus(); 522 if (rs != expectedTagVal) { 523 errln("Incorrect status for reverse break. Pos = " + bp + 524 " File line,col= " + line + ", " + t.srcCol[bp] + "\n" + 525 " Actual, Expected status = " + rs + ", " + expectedTagVal); 526 } 527 } 528 529 prevBP = bp; 530 } 531 532 // Verify that there were no missed breaks prior to the last one found 533 for (i=prevBP-1; i>=0; i--) { 534 if (t.expectedBreaks[i] != 0) { 535 errln("Reverse Itertion, break expected, but not found. Pos=" + i + 536 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); 537 } 538 } 539 // Check isBoundary() 540 for (i=0; i<=t.dataToBreak.length(); i++) { 541 boolean boundaryExpected = (t.expectedBreaks[i] != 0); 542 boolean boundaryFound = t.bi.isBoundary(i); 543 if (boundaryExpected != boundaryFound) { 544 errln("isBoundary(" + i + ") incorrect.\n" + 545 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + 546 " Expected, Actual= " + boundaryExpected + ", " + boundaryFound); 547 } 548 } 549 550 // Check following() 551 for (i=0; i<=t.dataToBreak.length(); i++) { 552 int actualBreak = t.bi.following(i); 553 int expectedBreak = BreakIterator.DONE; 554 for (int j=i+1; j < t.expectedBreaks.length; j++) { 555 if (t.expectedBreaks[j] != 0) { 556 expectedBreak = j; 557 break; 558 } 559 } 560 if (expectedBreak != actualBreak) { 561 errln("following(" + i + ") incorrect.\n" + 562 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + 563 " Expected, Actual= " + expectedBreak + ", " + actualBreak); 564 } 565 } 566 567 // Check preceding() 568 for (i=t.dataToBreak.length(); i>=0; i--) { 569 int actualBreak = t.bi.preceding(i); 570 int expectedBreak = BreakIterator.DONE; 571 572 for (int j=i-1; j >= 0; j--) { 573 if (t.expectedBreaks[j] != 0) { 574 expectedBreak = j; 575 break; 576 } 577 } 578 if (expectedBreak != actualBreak) { 579 errln("preceding(" + i + ") incorrect.\n" + 580 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + 581 " Expected, Actual= " + expectedBreak + ", " + actualBreak); 582 } 583 } 584 585 } 586 587 588 589 590 } 591