1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.rbbi; 10 11 import java.text.StringCharacterIterator; 12 import java.util.ArrayList; 13 import java.util.List; 14 import java.util.Locale; 15 16 import org.junit.Before; 17 import org.junit.Test; 18 19 import com.ibm.icu.dev.test.TestFmwk; 20 import com.ibm.icu.text.BreakIterator; 21 import com.ibm.icu.text.FilteredBreakIteratorBuilder; 22 import com.ibm.icu.util.ULocale; 23 24 public class BreakIteratorTest extends TestFmwk 25 { 26 private BreakIterator characterBreak; 27 private BreakIterator wordBreak; 28 private BreakIterator lineBreak; 29 private BreakIterator sentenceBreak; 30 private BreakIterator titleBreak; 31 32 public BreakIteratorTest() 33 { 34 35 } 36 37 @Before 38 public void init(){ 39 characterBreak = BreakIterator.getCharacterInstance(); 40 wordBreak = BreakIterator.getWordInstance(); 41 lineBreak = BreakIterator.getLineInstance(); 42 //logln("Creating sentence iterator..."); 43 sentenceBreak = BreakIterator.getSentenceInstance(); 44 //logln("Finished creating sentence iterator..."); 45 titleBreak = BreakIterator.getTitleInstance(); 46 } 47 //========================================================================= 48 // general test subroutines 49 //========================================================================= 50 51 private void generalIteratorTest(BreakIterator bi, List<String> expectedResult) { 52 StringBuffer buffer = new StringBuffer(); 53 String text; 54 for (int i = 0; i < expectedResult.size(); i++) { 55 text = expectedResult.get(i); 56 buffer.append(text); 57 } 58 text = buffer.toString(); 59 60 bi.setText(text); 61 62 List<String> nextResults = _testFirstAndNext(bi, text); 63 List<String> previousResults = _testLastAndPrevious(bi, text); 64 65 logln("comparing forward and backward..."); 66 //TODO(junit) - needs to be rewritten 67 //int errs = getErrorCount(); 68 compareFragmentLists("forward iteration", "backward iteration", nextResults, 69 previousResults); 70 //if (getErrorCount() == errs) { 71 logln("comparing expected and actual..."); 72 compareFragmentLists("expected result", "actual result", expectedResult, 73 nextResults); 74 logln("comparing expected and actual..."); 75 compareFragmentLists("expected result", "actual result", expectedResult, 76 nextResults); 77 //} 78 79 int[] boundaries = new int[expectedResult.size() + 3]; 80 boundaries[0] = BreakIterator.DONE; 81 boundaries[1] = 0; 82 for (int i = 0; i < expectedResult.size(); i++) 83 boundaries[i + 2] = boundaries[i + 1] + (expectedResult.get(i)). 84 length(); 85 boundaries[boundaries.length - 1] = BreakIterator.DONE; 86 87 _testFollowing(bi, text, boundaries); 88 _testPreceding(bi, text, boundaries); 89 _testIsBoundary(bi, text, boundaries); 90 91 doMultipleSelectionTest(bi, text); 92 } 93 94 private List<String> _testFirstAndNext(BreakIterator bi, String text) { 95 int p = bi.first(); 96 int lastP = p; 97 List<String> result = new ArrayList<String>(); 98 99 if (p != 0) 100 errln("first() returned " + p + " instead of 0"); 101 while (p != BreakIterator.DONE) { 102 p = bi.next(); 103 if (p != BreakIterator.DONE) { 104 if (p <= lastP) 105 errln("next() failed to move forward: next() on position " 106 + lastP + " yielded " + p); 107 108 result.add(text.substring(lastP, p)); 109 } 110 else { 111 if (lastP != text.length()) 112 errln("next() returned DONE prematurely: offset was " 113 + lastP + " instead of " + text.length()); 114 } 115 lastP = p; 116 } 117 return result; 118 } 119 120 private List<String> _testLastAndPrevious(BreakIterator bi, String text) { 121 int p = bi.last(); 122 int lastP = p; 123 List<String> result = new ArrayList<String>(); 124 125 if (p != text.length()) 126 errln("last() returned " + p + " instead of " + text.length()); 127 while (p != BreakIterator.DONE) { 128 p = bi.previous(); 129 if (p != BreakIterator.DONE) { 130 if (p >= lastP) 131 errln("previous() failed to move backward: previous() on position " 132 + lastP + " yielded " + p); 133 134 result.add(0, text.substring(p, lastP)); 135 } 136 else { 137 if (lastP != 0) 138 errln("previous() returned DONE prematurely: offset was " 139 + lastP + " instead of 0"); 140 } 141 lastP = p; 142 } 143 return result; 144 } 145 146 private void compareFragmentLists(String f1Name, String f2Name, List<String> f1, List<String> f2) { 147 int p1 = 0; 148 int p2 = 0; 149 String s1; 150 String s2; 151 int t1 = 0; 152 int t2 = 0; 153 154 while (p1 < f1.size() && p2 < f2.size()) { 155 s1 = f1.get(p1); 156 s2 = f2.get(p2); 157 t1 += s1.length(); 158 t2 += s2.length(); 159 160 if (s1.equals(s2)) { 161 debugLogln(" >" + s1 + "<"); 162 ++p1; 163 ++p2; 164 } 165 else { 166 int tempT1 = t1; 167 int tempT2 = t2; 168 int tempP1 = p1; 169 int tempP2 = p2; 170 171 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) { 172 while (tempT1 < tempT2 && tempP1 < f1.size()) { 173 tempT1 += (f1.get(tempP1)).length(); 174 ++tempP1; 175 } 176 while (tempT2 < tempT1 && tempP2 < f2.size()) { 177 tempT2 += (f2.get(tempP2)).length(); 178 ++tempP2; 179 } 180 } 181 logln("*** " + f1Name + " has:"); 182 while (p1 <= tempP1 && p1 < f1.size()) { 183 s1 = f1.get(p1); 184 t1 += s1.length(); 185 debugLogln(" *** >" + s1 + "<"); 186 ++p1; 187 } 188 logln("***** " + f2Name + " has:"); 189 while (p2 <= tempP2 && p2 < f2.size()) { 190 s2 = f2.get(p2); 191 t2 += s2.length(); 192 debugLogln(" ***** >" + s2 + "<"); 193 ++p2; 194 } 195 errln("Discrepancy between " + f1Name + " and " + f2Name); 196 } 197 } 198 } 199 200 private void _testFollowing(BreakIterator bi, String text, int[] boundaries) { 201 logln("testFollowing():"); 202 int p = 2; 203 for (int i = 0; i <= text.length(); i++) { 204 if (i == boundaries[p]) 205 ++p; 206 207 int b = bi.following(i); 208 logln("bi.following(" + i + ") -> " + b); 209 if (b != boundaries[p]) 210 errln("Wrong result from following() for " + i + ": expected " + boundaries[p] 211 + ", got " + b); 212 } 213 } 214 215 private void _testPreceding(BreakIterator bi, String text, int[] boundaries) { 216 logln("testPreceding():"); 217 int p = 0; 218 for (int i = 0; i <= text.length(); i++) { 219 int b = bi.preceding(i); 220 logln("bi.preceding(" + i + ") -> " + b); 221 if (b != boundaries[p]) 222 errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p] 223 + ", got " + b); 224 225 if (i == boundaries[p + 1]) 226 ++p; 227 } 228 } 229 230 private void _testIsBoundary(BreakIterator bi, String text, int[] boundaries) { 231 logln("testIsBoundary():"); 232 int p = 1; 233 boolean isB; 234 for (int i = 0; i <= text.length(); i++) { 235 isB = bi.isBoundary(i); 236 logln("bi.isBoundary(" + i + ") -> " + isB); 237 238 if (i == boundaries[p]) { 239 if (!isB) 240 errln("Wrong result from isBoundary() for " + i + ": expected true, got false"); 241 ++p; 242 } 243 else { 244 if (isB) 245 errln("Wrong result from isBoundary() for " + i + ": expected false, got true"); 246 } 247 } 248 } 249 250 private void doMultipleSelectionTest(BreakIterator iterator, String testText) 251 { 252 logln("Multiple selection test..."); 253 BreakIterator testIterator = (BreakIterator)iterator.clone(); 254 int offset = iterator.first(); 255 int testOffset; 256 int count = 0; 257 258 do { 259 testOffset = testIterator.first(); 260 testOffset = testIterator.next(count); 261 logln("next(" + count + ") -> " + testOffset); 262 if (offset != testOffset) 263 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset); 264 265 if (offset != BreakIterator.DONE) { 266 count++; 267 offset = iterator.next(); 268 } 269 } while (offset != BreakIterator.DONE); 270 271 // now do it backwards... 272 offset = iterator.last(); 273 count = 0; 274 275 do { 276 testOffset = testIterator.last(); 277 testOffset = testIterator.next(count); 278 logln("next(" + count + ") -> " + testOffset); 279 if (offset != testOffset) 280 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset); 281 282 if (offset != BreakIterator.DONE) { 283 count--; 284 offset = iterator.previous(); 285 } 286 } while (offset != BreakIterator.DONE); 287 } 288 289 290 private void doOtherInvariantTest(BreakIterator tb, String testChars) 291 { 292 StringBuffer work = new StringBuffer("a\r\na"); 293 int errorCount = 0; 294 295 // a break should never occur between CR and LF 296 for (int i = 0; i < testChars.length(); i++) { 297 work.setCharAt(0, testChars.charAt(i)); 298 for (int j = 0; j < testChars.length(); j++) { 299 work.setCharAt(3, testChars.charAt(j)); 300 tb.setText(work.toString()); 301 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) 302 if (k == 2) { 303 errln("Break between CR and LF in string U+" + Integer.toHexString( 304 (work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString( 305 (work.charAt(3)))); 306 errorCount++; 307 if (errorCount >= 75) 308 return; 309 } 310 } 311 } 312 313 // a break should never occur before a non-spacing mark, unless it's preceded 314 // by a line terminator 315 work.setLength(0); 316 work.append("aaaa"); 317 for (int i = 0; i < testChars.length(); i++) { 318 char c = testChars.charAt(i); 319 if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003') 320 continue; 321 work.setCharAt(1, c); 322 for (int j = 0; j < testChars.length(); j++) { 323 c = testChars.charAt(j); 324 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c) 325 != Character.ENCLOSING_MARK) 326 continue; 327 work.setCharAt(2, c); 328 tb.setText(work.toString()); 329 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) 330 if (k == 2) { 331 errln("Break between U+" + Integer.toHexString((work.charAt(1))) 332 + " and U+" + Integer.toHexString((work.charAt(2)))); 333 errorCount++; 334 if (errorCount >= 75) 335 return; 336 } 337 } 338 } 339 } 340 341 public void debugLogln(String s) { 342 final String zeros = "0000"; 343 String temp; 344 StringBuffer out = new StringBuffer(); 345 for (int i = 0; i < s.length(); i++) { 346 char c = s.charAt(i); 347 if (c >= ' ' && c < '\u007f') 348 out.append(c); 349 else { 350 out.append("\\u"); 351 temp = Integer.toHexString(c); 352 out.append(zeros.substring(0, 4 - temp.length())); 353 out.append(temp); 354 } 355 } 356 logln(out.toString()); 357 } 358 359 //========================================================================= 360 // tests 361 //========================================================================= 362 363 364 /** 365 * @bug 4097779 366 */ 367 @Test 368 public void TestBug4097779() { 369 List<String> wordSelectionData = new ArrayList<String>(2); 370 371 wordSelectionData.add("aa\u0300a"); 372 wordSelectionData.add(" "); 373 374 generalIteratorTest(wordBreak, wordSelectionData); 375 } 376 377 /** 378 * @bug 4098467 379 */ 380 @Test 381 public void TestBug4098467Words() { 382 List<String> wordSelectionData = new ArrayList<String>(); 383 384 // What follows is a string of Korean characters (I found it in the Yellow Pages 385 // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 386 // it correctly), first as precomposed syllables, and then as conjoining jamo. 387 // Both sequences should be semantically identical and break the same way. 388 // precomposed syllables... 389 wordSelectionData.add("\uc0c1\ud56d"); 390 wordSelectionData.add(" "); 391 wordSelectionData.add("\ud55c\uc778"); 392 wordSelectionData.add(" "); 393 wordSelectionData.add("\uc5f0\ud569"); 394 wordSelectionData.add(" "); 395 wordSelectionData.add("\uc7a5\ub85c\uad50\ud68c"); 396 wordSelectionData.add(" "); 397 // conjoining jamo... 398 wordSelectionData.add("\u1109\u1161\u11bc\u1112\u1161\u11bc"); 399 wordSelectionData.add(" "); 400 wordSelectionData.add("\u1112\u1161\u11ab\u110b\u1175\u11ab"); 401 wordSelectionData.add(" "); 402 wordSelectionData.add("\u110b\u1167\u11ab\u1112\u1161\u11b8"); 403 wordSelectionData.add(" "); 404 wordSelectionData.add("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"); 405 wordSelectionData.add(" "); 406 407 generalIteratorTest(wordBreak, wordSelectionData); 408 } 409 410 411 /** 412 * @bug 4111338 413 */ 414 @Test 415 public void TestBug4111338() { 416 List<String> sentenceSelectionData = new ArrayList<String>(); 417 418 // test for bug #4111338: Don't break sentences at the boundary between CJK 419 // and other letters 420 sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c" 421 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba" 422 + "\u611d\u57b6\u2510\u5d46\".\u2029"); 423 sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8" 424 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0" 425 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); 426 sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4" 427 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8" 428 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); 429 sentenceSelectionData.add("He said, \"I can go there.\"\u2029"); 430 431 generalIteratorTest(sentenceBreak, sentenceSelectionData); 432 } 433 434 435 /** 436 * @bug 4143071 437 */ 438 @Test 439 public void TestBug4143071() { 440 List<String> sentenceSelectionData = new ArrayList<String>(3); 441 442 // Make sure sentences that end with digits work right 443 sentenceSelectionData.add("Today is the 27th of May, 1998. "); 444 sentenceSelectionData.add("Tomorrow will be 28 May 1998. "); 445 sentenceSelectionData.add("The day after will be the 30th.\u2029"); 446 447 generalIteratorTest(sentenceBreak, sentenceSelectionData); 448 } 449 450 /** 451 * @bug 4152416 452 */ 453 @Test 454 public void TestBug4152416() { 455 List<String> sentenceSelectionData = new ArrayList<String>(2); 456 457 // Make sure sentences ending with a capital letter are treated correctly 458 sentenceSelectionData.add("The type of all primitive " 459 + "<code>boolean</code> values accessed in the target VM. "); 460 sentenceSelectionData.add("Calls to xxx will return an " 461 + "implementor of this interface.\u2029"); 462 463 generalIteratorTest(sentenceBreak, sentenceSelectionData); 464 } 465 466 /** 467 * @bug 4152117 468 */ 469 @Test 470 public void TestBug4152117() { 471 List<String> sentenceSelectionData = new ArrayList<String>(3); 472 473 // Make sure sentence breaking is handling punctuation correctly 474 // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE 475 // IT DOESN'T CROP UP] 476 sentenceSelectionData.add("Constructs a randomly generated " 477 + "BigInteger, uniformly distributed over the range <tt>0</tt> " 478 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. "); 479 sentenceSelectionData.add("The uniformity of the distribution " 480 + "assumes that a fair source of random bits is provided in " 481 + "<tt>rnd</tt>. "); 482 sentenceSelectionData.add("Note that this constructor always " 483 + "constructs a non-negative BigInteger.\u2029"); 484 485 generalIteratorTest(sentenceBreak, sentenceSelectionData); 486 } 487 488 @Test 489 public void TestLineBreak() { 490 List<String> lineSelectionData = new ArrayList<String>(); 491 492 lineSelectionData.add("Multi-"); 493 lineSelectionData.add("Level "); 494 lineSelectionData.add("example "); 495 lineSelectionData.add("of "); 496 lineSelectionData.add("a "); 497 lineSelectionData.add("semi-"); 498 lineSelectionData.add("idiotic "); 499 lineSelectionData.add("non-"); 500 lineSelectionData.add("sensical "); 501 lineSelectionData.add("(non-"); 502 lineSelectionData.add("important) "); 503 lineSelectionData.add("sentence. "); 504 505 lineSelectionData.add("Hi "); 506 lineSelectionData.add("Hello "); 507 lineSelectionData.add("How\n"); 508 lineSelectionData.add("are\r"); 509 lineSelectionData.add("you\u2028"); 510 lineSelectionData.add("fine.\t"); 511 lineSelectionData.add("good. "); 512 513 lineSelectionData.add("Now\r"); 514 lineSelectionData.add("is\n"); 515 lineSelectionData.add("the\r\n"); 516 lineSelectionData.add("time\n"); 517 lineSelectionData.add("\r"); 518 lineSelectionData.add("for\r"); 519 lineSelectionData.add("\r"); 520 lineSelectionData.add("all"); 521 522 generalIteratorTest(lineBreak, lineSelectionData); 523 } 524 525 /** 526 * @bug 4068133 527 */ 528 @Test 529 public void TestBug4068133() { 530 List<String> lineSelectionData = new ArrayList<String>(9); 531 532 lineSelectionData.add("\u96f6"); 533 lineSelectionData.add("\u4e00\u3002"); 534 lineSelectionData.add("\u4e8c\u3001"); 535 lineSelectionData.add("\u4e09\u3002\u3001"); 536 lineSelectionData.add("\u56db\u3001\u3002\u3001"); 537 lineSelectionData.add("\u4e94,"); 538 lineSelectionData.add("\u516d."); 539 lineSelectionData.add("\u4e03.\u3001,\u3002"); 540 lineSelectionData.add("\u516b"); 541 542 generalIteratorTest(lineBreak, lineSelectionData); 543 } 544 545 /** 546 * @bug 4086052 547 */ 548 @Test 549 public void TestBug4086052() { 550 List<String> lineSelectionData = new ArrayList<String>(1); 551 552 lineSelectionData.add("foo\u00a0bar "); 553 // lineSelectionData.addElement("foo\ufeffbar"); 554 555 generalIteratorTest(lineBreak, lineSelectionData); 556 } 557 558 /** 559 * @bug 4097920 560 */ 561 @Test 562 public void TestBug4097920() { 563 List<String> lineSelectionData = new ArrayList<String>(3); 564 565 lineSelectionData.add("dog,cat,mouse "); 566 lineSelectionData.add("(one)"); 567 lineSelectionData.add("(two)\n"); 568 generalIteratorTest(lineBreak, lineSelectionData); 569 } 570 571 572 573 /** 574 * @bug 4117554 575 */ 576 @Test 577 public void TestBug4117554Lines() { 578 List<String> lineSelectionData = new ArrayList<String>(3); 579 580 // Fullwidth .!? should be treated as postJwrd 581 lineSelectionData.add("\u4e01\uff0e"); 582 lineSelectionData.add("\u4e02\uff01"); 583 lineSelectionData.add("\u4e03\uff1f"); 584 585 generalIteratorTest(lineBreak, lineSelectionData); 586 } 587 588 @Test 589 public void TestLettersAndDigits() { 590 // a character sequence such as "X11" or "30F3" or "native2ascii" should 591 // be kept together as a single word 592 List<String> lineSelectionData = new ArrayList<String>(3); 593 594 lineSelectionData.add("X11 "); 595 lineSelectionData.add("30F3 "); 596 lineSelectionData.add("native2ascii"); 597 598 generalIteratorTest(lineBreak, lineSelectionData); 599 } 600 601 602 private static final String graveS = "S\u0300"; 603 private static final String acuteBelowI = "i\u0317"; 604 private static final String acuteE = "e\u0301"; 605 private static final String circumflexA = "a\u0302"; 606 private static final String tildeE = "e\u0303"; 607 608 @Test 609 public void TestCharacterBreak() { 610 List<String> characterSelectionData = new ArrayList<String>(); 611 612 characterSelectionData.add(graveS); 613 characterSelectionData.add(acuteBelowI); 614 characterSelectionData.add("m"); 615 characterSelectionData.add("p"); 616 characterSelectionData.add("l"); 617 characterSelectionData.add(acuteE); 618 characterSelectionData.add(" "); 619 characterSelectionData.add("s"); 620 characterSelectionData.add(circumflexA); 621 characterSelectionData.add("m"); 622 characterSelectionData.add("p"); 623 characterSelectionData.add("l"); 624 characterSelectionData.add(tildeE); 625 characterSelectionData.add("."); 626 characterSelectionData.add("w"); 627 characterSelectionData.add(circumflexA); 628 characterSelectionData.add("w"); 629 characterSelectionData.add("a"); 630 characterSelectionData.add("f"); 631 characterSelectionData.add("q"); 632 characterSelectionData.add("\n"); 633 characterSelectionData.add("\r"); 634 characterSelectionData.add("\r\n"); 635 characterSelectionData.add("\n"); 636 637 generalIteratorTest(characterBreak, characterSelectionData); 638 } 639 640 /** 641 * @bug 4098467 642 */ 643 @Test 644 public void TestBug4098467Characters() { 645 List<String> characterSelectionData = new ArrayList<String>(); 646 647 // What follows is a string of Korean characters (I found it in the Yellow Pages 648 // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 649 // it correctly), first as precomposed syllables, and then as conjoining jamo. 650 // Both sequences should be semantically identical and break the same way. 651 // precomposed syllables... 652 characterSelectionData.add("\uc0c1"); 653 characterSelectionData.add("\ud56d"); 654 characterSelectionData.add(" "); 655 characterSelectionData.add("\ud55c"); 656 characterSelectionData.add("\uc778"); 657 characterSelectionData.add(" "); 658 characterSelectionData.add("\uc5f0"); 659 characterSelectionData.add("\ud569"); 660 characterSelectionData.add(" "); 661 characterSelectionData.add("\uc7a5"); 662 characterSelectionData.add("\ub85c"); 663 characterSelectionData.add("\uad50"); 664 characterSelectionData.add("\ud68c"); 665 characterSelectionData.add(" "); 666 // conjoining jamo... 667 characterSelectionData.add("\u1109\u1161\u11bc"); 668 characterSelectionData.add("\u1112\u1161\u11bc"); 669 characterSelectionData.add(" "); 670 characterSelectionData.add("\u1112\u1161\u11ab"); 671 characterSelectionData.add("\u110b\u1175\u11ab"); 672 characterSelectionData.add(" "); 673 characterSelectionData.add("\u110b\u1167\u11ab"); 674 characterSelectionData.add("\u1112\u1161\u11b8"); 675 characterSelectionData.add(" "); 676 characterSelectionData.add("\u110c\u1161\u11bc"); 677 characterSelectionData.add("\u1105\u1169"); 678 characterSelectionData.add("\u1100\u116d"); 679 characterSelectionData.add("\u1112\u116c"); 680 681 generalIteratorTest(characterBreak, characterSelectionData); 682 } 683 684 @Test 685 public void TestTitleBreak() 686 { 687 List<String> titleData = new ArrayList<String>(); 688 titleData.add(" "); 689 titleData.add("This "); 690 titleData.add("is "); 691 titleData.add("a "); 692 titleData.add("simple "); 693 titleData.add("sample "); 694 titleData.add("sentence. "); 695 titleData.add("This "); 696 697 generalIteratorTest(titleBreak, titleData); 698 } 699 700 701 702 /* 703 * @bug 4153072 704 */ 705 @Test 706 public void TestBug4153072() { 707 BreakIterator iter = BreakIterator.getWordInstance(); 708 String str = "...Hello, World!..."; 709 int begin = 3; 710 int end = str.length() - 3; 711 // not used boolean gotException = false; 712 713 714 iter.setText(new StringCharacterIterator(str, begin, end, begin)); 715 for (int index = -1; index < begin + 1; ++index) { 716 try { 717 iter.isBoundary(index); 718 if (index < begin) 719 errln("Didn't get exception with offset = " + index + 720 " and begin index = " + begin); 721 } 722 catch (IllegalArgumentException e) { 723 if (index >= begin) 724 errln("Got exception with offset = " + index + 725 " and begin index = " + begin); 726 } 727 } 728 } 729 730 731 @Test 732 public void TestBug4146175Lines() { 733 List<String> lineSelectionData = new ArrayList<String>(2); 734 735 // the fullwidth comma should stick to the preceding Japanese character 736 lineSelectionData.add("\u7d42\uff0c"); 737 lineSelectionData.add("\u308f"); 738 739 generalIteratorTest(lineBreak, lineSelectionData); 740 } 741 742 private static final String cannedTestChars 743 = "\u0000\u0001\u0002\u0003\u0004 !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2" 744 + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3" 745 + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303" 746 + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000" 747 + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f" 748 + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164"; 749 750 @Test 751 public void TestSentenceInvariants() 752 { 753 BreakIterator e = BreakIterator.getSentenceInstance(); 754 doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff"); 755 } 756 757 @Test 758 public void TestEmptyString() 759 { 760 String text = ""; 761 List<String> x = new ArrayList<String>(1); 762 x.add(text); 763 764 generalIteratorTest(lineBreak, x); 765 } 766 767 @Test 768 public void TestGetAvailableLocales() 769 { 770 Locale[] locList = BreakIterator.getAvailableLocales(); 771 772 if (locList.length == 0) 773 errln("getAvailableLocales() returned an empty list!"); 774 // I have no idea how to test this function... 775 776 com.ibm.icu.util.ULocale[] ulocList = BreakIterator.getAvailableULocales(); 777 if (ulocList.length == 0) { 778 errln("getAvailableULocales() returned an empty list!"); 779 } else { 780 logln("getAvailableULocales() returned " + ulocList.length + " locales"); 781 } 782 } 783 784 785 /** 786 * @bug 4068137 787 */ 788 @Test 789 public void TestEndBehavior() 790 { 791 String testString = "boo."; 792 BreakIterator wb = BreakIterator.getWordInstance(); 793 wb.setText(testString); 794 795 if (wb.first() != 0) 796 errln("Didn't get break at beginning of string."); 797 if (wb.next() != 3) 798 errln("Didn't get break before period in \"boo.\""); 799 if (wb.current() != 4 && wb.next() != 4) 800 errln("Didn't get break at end of string."); 801 } 802 803 // The Following two tests are ported from ICU4C 1.8.1 [Richard/GCL] 804 /** 805 * Port From: ICU4C v1.8.1 : textbounds : IntlTestTextBoundary 806 * Source File: $ICU4CRoot/source/test/intltest/ittxtbd.cpp 807 **/ 808 /** 809 * test methods preceding, following and isBoundary 810 **/ 811 @Test 812 public void TestPreceding() { 813 String words3 = "aaa bbb ccc"; 814 BreakIterator e = BreakIterator.getWordInstance(Locale.getDefault()); 815 e.setText( words3 ); 816 e.first(); 817 int p1 = e.next(); 818 int p2 = e.next(); 819 int p3 = e.next(); 820 int p4 = e.next(); 821 822 int f = e.following(p2+1); 823 int p = e.preceding(p2+1); 824 if (f!=p3) 825 errln("IntlTestTextBoundary::TestPreceding: f!=p3"); 826 if (p!=p2) 827 errln("IntlTestTextBoundary::TestPreceding: p!=p2"); 828 829 if (p1+1!=p2) 830 errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2"); 831 832 if (p3+1!=p4) 833 errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4"); 834 835 if (!e.isBoundary(p2) || e.isBoundary(p2+1) || !e.isBoundary(p3)) 836 { 837 errln("IntlTestTextBoundary::TestPreceding: isBoundary err"); 838 } 839 } 840 841 842 /** 843 * Bug 4450804 844 */ 845 @Test 846 public void TestLineBreakContractions() { 847 List<String> expected = new ArrayList<String>(7); 848 expected.add("These "); 849 expected.add("are "); 850 expected.add("'foobles'. "); 851 expected.add("Don't "); 852 expected.add("you "); 853 expected.add("like "); 854 expected.add("them?"); 855 generalIteratorTest(lineBreak, expected); 856 } 857 858 /** 859 * Ticket#5615 860 */ 861 @Test 862 public void TestT5615() { 863 com.ibm.icu.util.ULocale[] ulocales = BreakIterator.getAvailableULocales(); 864 int type = 0; 865 com.ibm.icu.util.ULocale loc = null; 866 try { 867 for (int i = 0; i < ulocales.length; i++) { 868 loc = ulocales[i]; 869 for (type = 0; type < 5 /* 5 = BreakIterator.KIND_COUNT */; ++type) { 870 BreakIterator brk = BreakIterator.getBreakInstance(loc, type); 871 if (brk == null) { 872 errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc); 873 } 874 } 875 } 876 } catch (Exception e) { 877 errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage()); 878 } 879 } 880 881 /** 882 * At present, Japanese doesn't have exceptions. 883 * However, this still should not fail. 884 */ 885 @Test 886 public void TestFilteredJapanese() { 887 ULocale loc = ULocale.JAPANESE; 888 BreakIterator brk = FilteredBreakIteratorBuilder 889 .createInstance(loc) 890 .build(BreakIterator.getSentenceInstance(loc)); 891 brk.setText(""); 892 assertEquals("Starting point", 0, brk.current()); 893 assertEquals("Next point", 5, brk.next()); 894 assertEquals("Last point", BreakIterator.DONE, brk.next()); 895 } 896 897 /* 898 * Test case for Ticket#10721. BreakIterator factory method should throw NPE 899 * when specified locale is null. 900 */ 901 @Test 902 public void TestNullLocale() { 903 Locale loc = null; 904 ULocale uloc = null; 905 906 @SuppressWarnings("unused") 907 BreakIterator brk; 908 909 // Character 910 try { 911 brk = BreakIterator.getCharacterInstance(loc); 912 errln("getCharacterInstance((Locale)null) did not throw NPE."); 913 } catch (NullPointerException e) { /* OK */ } 914 try { 915 brk = BreakIterator.getCharacterInstance(uloc); 916 errln("getCharacterInstance((ULocale)null) did not throw NPE."); 917 } catch (NullPointerException e) { /* OK */ } 918 919 // Line 920 try { 921 brk = BreakIterator.getLineInstance(loc); 922 errln("getLineInstance((Locale)null) did not throw NPE."); 923 } catch (NullPointerException e) { /* OK */ } 924 try { 925 brk = BreakIterator.getLineInstance(uloc); 926 errln("getLineInstance((ULocale)null) did not throw NPE."); 927 } catch (NullPointerException e) { /* OK */ } 928 929 // Sentence 930 try { 931 brk = BreakIterator.getSentenceInstance(loc); 932 errln("getSentenceInstance((Locale)null) did not throw NPE."); 933 } catch (NullPointerException e) { /* OK */ } 934 try { 935 brk = BreakIterator.getSentenceInstance(uloc); 936 errln("getSentenceInstance((ULocale)null) did not throw NPE."); 937 } catch (NullPointerException e) { /* OK */ } 938 939 // Title 940 try { 941 brk = BreakIterator.getTitleInstance(loc); 942 errln("getTitleInstance((Locale)null) did not throw NPE."); 943 } catch (NullPointerException e) { /* OK */ } 944 try { 945 brk = BreakIterator.getTitleInstance(uloc); 946 errln("getTitleInstance((ULocale)null) did not throw NPE."); 947 } catch (NullPointerException e) { /* OK */ } 948 949 // Word 950 try { 951 brk = BreakIterator.getWordInstance(loc); 952 errln("getWordInstance((Locale)null) did not throw NPE."); 953 } catch (NullPointerException e) { /* OK */ } 954 try { 955 brk = BreakIterator.getWordInstance(uloc); 956 errln("getWordInstance((ULocale)null) did not throw NPE."); 957 } catch (NullPointerException e) { /* OK */ } 958 } 959 960 /** 961 * Test FilteredBreakIteratorBuilder newly introduced 962 */ 963 @Test 964 public void TestFilteredBreakIteratorBuilder() { 965 FilteredBreakIteratorBuilder builder; 966 BreakIterator baseBI; 967 BreakIterator filteredBI; 968 969 String text = "In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."; // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited. 970 String ABBR_MR = "Mr."; 971 String ABBR_CAPT = "Capt."; 972 973 { 974 logln("Constructing empty builder\n"); 975 builder = FilteredBreakIteratorBuilder.createInstance(); 976 977 logln("Constructing base BI\n"); 978 baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); 979 980 logln("Building new BI\n"); 981 filteredBI = builder.build(baseBI); 982 983 assertDefaultBreakBehavior(filteredBI, text); 984 } 985 986 { 987 logln("Constructing empty builder\n"); 988 builder = FilteredBreakIteratorBuilder.createInstance(); 989 990 logln("Adding Mr. as an exception\n"); 991 992 assertEquals("2.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); 993 assertEquals("2.2 suppressBreakAfter", false, builder.suppressBreakAfter(ABBR_MR)); 994 assertEquals("2.3 unsuppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_MR)); 995 assertEquals("2.4 unsuppressBreakAfter", false, builder.unsuppressBreakAfter(ABBR_MR)); 996 assertEquals("2.5 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); 997 998 logln("Constructing base BI\n"); 999 baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); 1000 1001 logln("Building new BI\n"); 1002 filteredBI = builder.build(baseBI); 1003 1004 logln("Testing:"); 1005 filteredBI.setText(text); 1006 assertEquals("2nd next", 84, filteredBI.next()); 1007 assertEquals("2nd next", 90, filteredBI.next()); 1008 assertEquals("2nd next", 278, filteredBI.next()); 1009 filteredBI.first(); 1010 } 1011 1012 1013 { 1014 logln("Constructing empty builder\n"); 1015 builder = FilteredBreakIteratorBuilder.createInstance(); 1016 1017 logln("Adding Mr. and Capt as an exception\n"); 1018 assertEquals("3.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); 1019 assertEquals("3.2 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_CAPT)); 1020 1021 logln("Constructing base BI\n"); 1022 baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); 1023 1024 logln("Building new BI\n"); 1025 filteredBI = builder.build(baseBI); 1026 1027 logln("Testing:"); 1028 filteredBI.setText(text); 1029 assertEquals("3rd next", 84, filteredBI.next()); 1030 assertEquals("3rd next", 278, filteredBI.next()); 1031 filteredBI.first(); 1032 } 1033 1034 { 1035 logln("Constructing English builder\n"); 1036 builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); 1037 1038 logln("Constructing base BI\n"); 1039 baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); 1040 1041 logln("unsuppressing 'Capt'"); 1042 assertEquals("1st suppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_CAPT)); 1043 1044 logln("Building new BI\n"); 1045 filteredBI = builder.build(baseBI); 1046 1047 if(filteredBI != null) { 1048 logln("Testing:"); 1049 filteredBI.setText(text); 1050 assertEquals("4th next", 84, filteredBI.next()); 1051 assertEquals("4th next", 90, filteredBI.next()); 1052 assertEquals("4th next", 278, filteredBI.next()); 1053 filteredBI.first(); 1054 } 1055 } 1056 1057 { 1058 logln("Constructing English builder\n"); 1059 builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); 1060 1061 logln("Constructing base BI\n"); 1062 baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); 1063 1064 logln("Building new BI\n"); 1065 filteredBI = builder.build(baseBI); 1066 1067 if(filteredBI != null) { 1068 assertEnglishBreakBehavior(filteredBI, text); 1069 } 1070 } 1071 1072 { 1073 logln("Constructing English @ss=standard\n"); 1074 filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("en-US-u-ss-standard")); 1075 1076 if(filteredBI != null) { 1077 assertEnglishBreakBehavior(filteredBI, text); 1078 } 1079 } 1080 1081 { 1082 logln("Constructing Afrikaans @ss=standard - should be == default\n"); 1083 filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("af-u-ss-standard")); 1084 1085 assertDefaultBreakBehavior(filteredBI, text); 1086 } 1087 1088 { 1089 logln("Constructing Japanese @ss=standard - should be == default\n"); 1090 filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("ja-u-ss-standard")); 1091 1092 assertDefaultBreakBehavior(filteredBI, text); 1093 } 1094 { 1095 logln("Constructing tfg @ss=standard - should be == default\n"); 1096 filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("tfg-u-ss-standard")); 1097 1098 assertDefaultBreakBehavior(filteredBI, text); 1099 } 1100 1101 { 1102 logln("Constructing French builder"); 1103 builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH); 1104 1105 logln("Constructing base BI\n"); 1106 baseBI = BreakIterator.getSentenceInstance(Locale.FRENCH); 1107 1108 logln("Building new BI\n"); 1109 filteredBI = builder.build(baseBI); 1110 1111 if(filteredBI != null) { 1112 assertFrenchBreakBehavior(filteredBI, text); 1113 } 1114 } 1115 } 1116 1117 /** 1118 * @param filteredBI 1119 * @param text 1120 */ 1121 private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text) { 1122 logln("Testing French behavior:"); 1123 filteredBI.setText(text); 1124 assertEquals("6th next", 20, filteredBI.next()); 1125 assertEquals("6th next", 84, filteredBI.next()); 1126 filteredBI.first(); 1127 } 1128 1129 /** 1130 * @param filteredBI 1131 * @param text 1132 */ 1133 private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text) { 1134 logln("Testing English filtered behavior:"); 1135 filteredBI.setText(text); 1136 1137 assertEquals("5th next", 84, filteredBI.next()); 1138 assertEquals("5th next", 278, filteredBI.next()); 1139 filteredBI.first(); 1140 } 1141 1142 /** 1143 * @param filteredBI 1144 * @param text 1145 */ 1146 private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text) { 1147 logln("Testing Default Behavior:"); 1148 filteredBI.setText(text); 1149 assertEquals("1st next", 20, filteredBI.next()); 1150 assertEquals("1st next", 84, filteredBI.next()); 1151 assertEquals("1st next", 90, filteredBI.next()); 1152 assertEquals("1st next", 181, filteredBI.next()); 1153 assertEquals("1st next", 278, filteredBI.next()); 1154 filteredBI.first(); 1155 } 1156 } 1157