Home | History | Annotate | Download | only in rbbi
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2016, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.test.rbbi;
     10 
     11 import java.text.StringCharacterIterator;
     12 import java.util.ArrayList;
     13 import java.util.List;
     14 import java.util.Locale;
     15 
     16 import org.junit.Before;
     17 import org.junit.Test;
     18 
     19 import com.ibm.icu.dev.test.TestFmwk;
     20 import com.ibm.icu.text.BreakIterator;
     21 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
     22 import com.ibm.icu.util.ULocale;
     23 
     24 public class BreakIteratorTest extends TestFmwk
     25 {
    // Iterators shared by the test methods. Each one is (re)assigned in init(),
    // which carries @Before, from the matching no-argument BreakIterator factory.
    private BreakIterator characterBreak;
    private BreakIterator wordBreak;
    private BreakIterator lineBreak;
    private BreakIterator sentenceBreak;
    private BreakIterator titleBreak;
     31 
     32     public BreakIteratorTest()
     33     {
     34 
     35     }
     36 
     37     @Before
     38     public void init(){
     39         characterBreak = BreakIterator.getCharacterInstance();
     40         wordBreak = BreakIterator.getWordInstance();
     41         lineBreak = BreakIterator.getLineInstance();
     42         //logln("Creating sentence iterator...");
     43         sentenceBreak = BreakIterator.getSentenceInstance();
     44         //logln("Finished creating sentence iterator...");
     45         titleBreak = BreakIterator.getTitleInstance();
     46     }
     47     //=========================================================================
     48     // general test subroutines
     49     //=========================================================================
     50 
     51     private void generalIteratorTest(BreakIterator bi, List<String> expectedResult) {
     52         StringBuffer buffer = new StringBuffer();
     53         String text;
     54         for (int i = 0; i < expectedResult.size(); i++) {
     55             text = expectedResult.get(i);
     56             buffer.append(text);
     57         }
     58         text = buffer.toString();
     59 
     60         bi.setText(text);
     61 
     62         List<String> nextResults = _testFirstAndNext(bi, text);
     63         List<String> previousResults = _testLastAndPrevious(bi, text);
     64 
     65         logln("comparing forward and backward...");
     66         //TODO(junit) - needs to be rewritten
     67         //int errs = getErrorCount();
     68         compareFragmentLists("forward iteration", "backward iteration", nextResults,
     69                         previousResults);
     70         //if (getErrorCount() == errs) {
     71         logln("comparing expected and actual...");
     72         compareFragmentLists("expected result", "actual result", expectedResult,
     73                         nextResults);
     74         logln("comparing expected and actual...");
     75         compareFragmentLists("expected result", "actual result", expectedResult,
     76                             nextResults);
     77         //}
     78 
     79         int[] boundaries = new int[expectedResult.size() + 3];
     80         boundaries[0] = BreakIterator.DONE;
     81         boundaries[1] = 0;
     82         for (int i = 0; i < expectedResult.size(); i++)
     83             boundaries[i + 2] = boundaries[i + 1] + (expectedResult.get(i)).
     84                             length();
     85         boundaries[boundaries.length - 1] = BreakIterator.DONE;
     86 
     87         _testFollowing(bi, text, boundaries);
     88         _testPreceding(bi, text, boundaries);
     89         _testIsBoundary(bi, text, boundaries);
     90 
     91         doMultipleSelectionTest(bi, text);
     92     }
     93 
     94     private List<String> _testFirstAndNext(BreakIterator bi, String text) {
     95         int p = bi.first();
     96         int lastP = p;
     97         List<String> result = new ArrayList<String>();
     98 
     99         if (p != 0)
    100             errln("first() returned " + p + " instead of 0");
    101         while (p != BreakIterator.DONE) {
    102             p = bi.next();
    103             if (p != BreakIterator.DONE) {
    104                 if (p <= lastP)
    105                     errln("next() failed to move forward: next() on position "
    106                                     + lastP + " yielded " + p);
    107 
    108                 result.add(text.substring(lastP, p));
    109             }
    110             else {
    111                 if (lastP != text.length())
    112                     errln("next() returned DONE prematurely: offset was "
    113                                     + lastP + " instead of " + text.length());
    114             }
    115             lastP = p;
    116         }
    117         return result;
    118     }
    119 
    120     private List<String> _testLastAndPrevious(BreakIterator bi, String text) {
    121         int p = bi.last();
    122         int lastP = p;
    123         List<String> result = new ArrayList<String>();
    124 
    125         if (p != text.length())
    126             errln("last() returned " + p + " instead of " + text.length());
    127         while (p != BreakIterator.DONE) {
    128             p = bi.previous();
    129             if (p != BreakIterator.DONE) {
    130                 if (p >= lastP)
    131                     errln("previous() failed to move backward: previous() on position "
    132                                     + lastP + " yielded " + p);
    133 
    134                 result.add(0, text.substring(p, lastP));
    135             }
    136             else {
    137                 if (lastP != 0)
    138                     errln("previous() returned DONE prematurely: offset was "
    139                                     + lastP + " instead of 0");
    140             }
    141             lastP = p;
    142         }
    143         return result;
    144     }
    145 
    146     private void compareFragmentLists(String f1Name, String f2Name, List<String> f1, List<String> f2) {
    147         int p1 = 0;
    148         int p2 = 0;
    149         String s1;
    150         String s2;
    151         int t1 = 0;
    152         int t2 = 0;
    153 
    154         while (p1 < f1.size() && p2 < f2.size()) {
    155             s1 = f1.get(p1);
    156             s2 = f2.get(p2);
    157             t1 += s1.length();
    158             t2 += s2.length();
    159 
    160             if (s1.equals(s2)) {
    161                 debugLogln("   >" + s1 + "<");
    162                 ++p1;
    163                 ++p2;
    164             }
    165             else {
    166                 int tempT1 = t1;
    167                 int tempT2 = t2;
    168                 int tempP1 = p1;
    169                 int tempP2 = p2;
    170 
    171                 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
    172                     while (tempT1 < tempT2 && tempP1 < f1.size()) {
    173                         tempT1 += (f1.get(tempP1)).length();
    174                         ++tempP1;
    175                     }
    176                     while (tempT2 < tempT1 && tempP2 < f2.size()) {
    177                         tempT2 += (f2.get(tempP2)).length();
    178                         ++tempP2;
    179                     }
    180                 }
    181                 logln("*** " + f1Name + " has:");
    182                 while (p1 <= tempP1 && p1 < f1.size()) {
    183                     s1 = f1.get(p1);
    184                     t1 += s1.length();
    185                     debugLogln(" *** >" + s1 + "<");
    186                     ++p1;
    187                 }
    188                 logln("***** " + f2Name + " has:");
    189                 while (p2 <= tempP2 && p2 < f2.size()) {
    190                     s2 = f2.get(p2);
    191                     t2 += s2.length();
    192                     debugLogln(" ***** >" + s2 + "<");
    193                     ++p2;
    194                 }
    195                 errln("Discrepancy between " + f1Name + " and " + f2Name);
    196             }
    197         }
    198     }
    199 
    200     private void _testFollowing(BreakIterator bi, String text, int[] boundaries) {
    201         logln("testFollowing():");
    202         int p = 2;
    203         for (int i = 0; i <= text.length(); i++) {
    204             if (i == boundaries[p])
    205                 ++p;
    206 
    207             int b = bi.following(i);
    208             logln("bi.following(" + i + ") -> " + b);
    209             if (b != boundaries[p])
    210                 errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
    211                                 + ", got " + b);
    212         }
    213     }
    214 
    215     private void _testPreceding(BreakIterator bi, String text, int[] boundaries) {
    216         logln("testPreceding():");
    217         int p = 0;
    218         for (int i = 0; i <= text.length(); i++) {
    219             int b = bi.preceding(i);
    220             logln("bi.preceding(" + i + ") -> " + b);
    221             if (b != boundaries[p])
    222                 errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
    223                                 + ", got " + b);
    224 
    225             if (i == boundaries[p + 1])
    226                 ++p;
    227         }
    228     }
    229 
    230     private void _testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
    231         logln("testIsBoundary():");
    232         int p = 1;
    233         boolean isB;
    234         for (int i = 0; i <= text.length(); i++) {
    235             isB = bi.isBoundary(i);
    236             logln("bi.isBoundary(" + i + ") -> " + isB);
    237 
    238             if (i == boundaries[p]) {
    239                 if (!isB)
    240                     errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
    241                 ++p;
    242             }
    243             else {
    244                 if (isB)
    245                     errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
    246             }
    247         }
    248     }
    249 
    250     private void doMultipleSelectionTest(BreakIterator iterator, String testText)
    251     {
    252         logln("Multiple selection test...");
    253         BreakIterator testIterator = (BreakIterator)iterator.clone();
    254         int offset = iterator.first();
    255         int testOffset;
    256         int count = 0;
    257 
    258         do {
    259             testOffset = testIterator.first();
    260             testOffset = testIterator.next(count);
    261             logln("next(" + count + ") -> " + testOffset);
    262             if (offset != testOffset)
    263                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    264 
    265             if (offset != BreakIterator.DONE) {
    266                 count++;
    267                 offset = iterator.next();
    268             }
    269         } while (offset != BreakIterator.DONE);
    270 
    271         // now do it backwards...
    272         offset = iterator.last();
    273         count = 0;
    274 
    275         do {
    276             testOffset = testIterator.last();
    277             testOffset = testIterator.next(count);
    278             logln("next(" + count + ") -> " + testOffset);
    279             if (offset != testOffset)
    280                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    281 
    282             if (offset != BreakIterator.DONE) {
    283                 count--;
    284                 offset = iterator.previous();
    285             }
    286         } while (offset != BreakIterator.DONE);
    287     }
    288 
    289 
    290     private void doOtherInvariantTest(BreakIterator tb, String testChars)
    291     {
    292         StringBuffer work = new StringBuffer("a\r\na");
    293         int errorCount = 0;
    294 
    295         // a break should never occur between CR and LF
    296         for (int i = 0; i < testChars.length(); i++) {
    297             work.setCharAt(0, testChars.charAt(i));
    298             for (int j = 0; j < testChars.length(); j++) {
    299                 work.setCharAt(3, testChars.charAt(j));
    300                 tb.setText(work.toString());
    301                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
    302                     if (k == 2) {
    303                         errln("Break between CR and LF in string U+" + Integer.toHexString(
    304                                 (work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
    305                                 (work.charAt(3))));
    306                         errorCount++;
    307                         if (errorCount >= 75)
    308                             return;
    309                     }
    310             }
    311         }
    312 
    313         // a break should never occur before a non-spacing mark, unless it's preceded
    314         // by a line terminator
    315         work.setLength(0);
    316         work.append("aaaa");
    317         for (int i = 0; i < testChars.length(); i++) {
    318             char c = testChars.charAt(i);
    319             if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
    320                 continue;
    321             work.setCharAt(1, c);
    322             for (int j = 0; j < testChars.length(); j++) {
    323                 c = testChars.charAt(j);
    324                 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
    325                         != Character.ENCLOSING_MARK)
    326                     continue;
    327                 work.setCharAt(2, c);
    328                 tb.setText(work.toString());
    329                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
    330                     if (k == 2) {
    331                         errln("Break between U+" + Integer.toHexString((work.charAt(1)))
    332                                 + " and U+" + Integer.toHexString((work.charAt(2))));
    333                         errorCount++;
    334                         if (errorCount >= 75)
    335                             return;
    336                     }
    337             }
    338         }
    339     }
    340 
    341     public void debugLogln(String s) {
    342         final String zeros = "0000";
    343         String temp;
    344         StringBuffer out = new StringBuffer();
    345         for (int i = 0; i < s.length(); i++) {
    346             char c = s.charAt(i);
    347             if (c >= ' ' && c < '\u007f')
    348                 out.append(c);
    349             else {
    350                 out.append("\\u");
    351                 temp = Integer.toHexString(c);
    352                 out.append(zeros.substring(0, 4 - temp.length()));
    353                 out.append(temp);
    354             }
    355         }
    356         logln(out.toString());
    357     }
    358 
    359     //=========================================================================
    360     // tests
    361     //=========================================================================
    362 
    363 
    364     /**
    365      * @bug 4097779
    366      */
    367     @Test
    368     public void TestBug4097779() {
    369         List<String> wordSelectionData = new ArrayList<String>(2);
    370 
    371         wordSelectionData.add("aa\u0300a");
    372         wordSelectionData.add(" ");
    373 
    374         generalIteratorTest(wordBreak, wordSelectionData);
    375     }
    376 
    377     /**
    378      * @bug 4098467
    379      */
    380     @Test
    381     public void TestBug4098467Words() {
    382         List<String> wordSelectionData = new ArrayList<String>();
    383 
    384         // What follows is a string of Korean characters (I found it in the Yellow Pages
    385         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    386         // it correctly), first as precomposed syllables, and then as conjoining jamo.
    387         // Both sequences should be semantically identical and break the same way.
    388         // precomposed syllables...
    389         wordSelectionData.add("\uc0c1\ud56d");
    390         wordSelectionData.add(" ");
    391         wordSelectionData.add("\ud55c\uc778");
    392         wordSelectionData.add(" ");
    393         wordSelectionData.add("\uc5f0\ud569");
    394         wordSelectionData.add(" ");
    395         wordSelectionData.add("\uc7a5\ub85c\uad50\ud68c");
    396         wordSelectionData.add(" ");
    397         // conjoining jamo...
    398         wordSelectionData.add("\u1109\u1161\u11bc\u1112\u1161\u11bc");
    399         wordSelectionData.add(" ");
    400         wordSelectionData.add("\u1112\u1161\u11ab\u110b\u1175\u11ab");
    401         wordSelectionData.add(" ");
    402         wordSelectionData.add("\u110b\u1167\u11ab\u1112\u1161\u11b8");
    403         wordSelectionData.add(" ");
    404         wordSelectionData.add("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
    405         wordSelectionData.add(" ");
    406 
    407         generalIteratorTest(wordBreak, wordSelectionData);
    408     }
    409 
    410 
    411     /**
    412      * @bug 4111338
    413      */
    414     @Test
    415     public void TestBug4111338() {
    416         List<String> sentenceSelectionData = new ArrayList<String>();
    417 
    418         // test for bug #4111338: Don't break sentences at the boundary between CJK
    419         // and other letters
    420         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
    421                 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
    422                 + "\u611d\u57b6\u2510\u5d46\".\u2029");
    423         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
    424                 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
    425                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
    426         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
    427                 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
    428                 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
    429         sentenceSelectionData.add("He said, \"I can go there.\"\u2029");
    430 
    431         generalIteratorTest(sentenceBreak, sentenceSelectionData);
    432     }
    433 
    434 
    435     /**
    436      * @bug 4143071
    437      */
    438     @Test
    439     public void TestBug4143071() {
    440         List<String> sentenceSelectionData = new ArrayList<String>(3);
    441 
    442         // Make sure sentences that end with digits work right
    443         sentenceSelectionData.add("Today is the 27th of May, 1998.  ");
    444         sentenceSelectionData.add("Tomorrow will be 28 May 1998.  ");
    445         sentenceSelectionData.add("The day after will be the 30th.\u2029");
    446 
    447         generalIteratorTest(sentenceBreak, sentenceSelectionData);
    448     }
    449 
    450     /**
    451      * @bug 4152416
    452      */
    453     @Test
    454     public void TestBug4152416() {
    455         List<String> sentenceSelectionData = new ArrayList<String>(2);
    456 
    457         // Make sure sentences ending with a capital letter are treated correctly
    458         sentenceSelectionData.add("The type of all primitive "
    459                 + "<code>boolean</code> values accessed in the target VM.  ");
    460         sentenceSelectionData.add("Calls to xxx will return an "
    461                 + "implementor of this interface.\u2029");
    462 
    463         generalIteratorTest(sentenceBreak, sentenceSelectionData);
    464     }
    465 
    466     /**
    467      * @bug 4152117
    468      */
    469     @Test
    470     public void TestBug4152117() {
    471         List<String> sentenceSelectionData = new ArrayList<String>(3);
    472 
    473         // Make sure sentence breaking is handling punctuation correctly
    474         // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
    475         // IT DOESN'T CROP UP]
    476         sentenceSelectionData.add("Constructs a randomly generated "
    477                 + "BigInteger, uniformly distributed over the range <tt>0</tt> "
    478                 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
    479         sentenceSelectionData.add("The uniformity of the distribution "
    480                 + "assumes that a fair source of random bits is provided in "
    481                 + "<tt>rnd</tt>.  ");
    482         sentenceSelectionData.add("Note that this constructor always "
    483                 + "constructs a non-negative BigInteger.\u2029");
    484 
    485         generalIteratorTest(sentenceBreak, sentenceSelectionData);
    486     }
    487 
    488     @Test
    489     public void TestLineBreak() {
    490         List<String> lineSelectionData = new ArrayList<String>();
    491 
    492         lineSelectionData.add("Multi-");
    493         lineSelectionData.add("Level ");
    494         lineSelectionData.add("example ");
    495         lineSelectionData.add("of ");
    496         lineSelectionData.add("a ");
    497         lineSelectionData.add("semi-");
    498         lineSelectionData.add("idiotic ");
    499         lineSelectionData.add("non-");
    500         lineSelectionData.add("sensical ");
    501         lineSelectionData.add("(non-");
    502         lineSelectionData.add("important) ");
    503         lineSelectionData.add("sentence. ");
    504 
    505         lineSelectionData.add("Hi  ");
    506         lineSelectionData.add("Hello ");
    507         lineSelectionData.add("How\n");
    508         lineSelectionData.add("are\r");
    509         lineSelectionData.add("you\u2028");
    510         lineSelectionData.add("fine.\t");
    511         lineSelectionData.add("good.  ");
    512 
    513         lineSelectionData.add("Now\r");
    514         lineSelectionData.add("is\n");
    515         lineSelectionData.add("the\r\n");
    516         lineSelectionData.add("time\n");
    517         lineSelectionData.add("\r");
    518         lineSelectionData.add("for\r");
    519         lineSelectionData.add("\r");
    520         lineSelectionData.add("all");
    521 
    522         generalIteratorTest(lineBreak, lineSelectionData);
    523     }
    524 
    525     /**
    526      * @bug 4068133
    527      */
    528     @Test
    529     public void TestBug4068133() {
    530         List<String> lineSelectionData = new ArrayList<String>(9);
    531 
    532         lineSelectionData.add("\u96f6");
    533         lineSelectionData.add("\u4e00\u3002");
    534         lineSelectionData.add("\u4e8c\u3001");
    535         lineSelectionData.add("\u4e09\u3002\u3001");
    536         lineSelectionData.add("\u56db\u3001\u3002\u3001");
    537         lineSelectionData.add("\u4e94,");
    538         lineSelectionData.add("\u516d.");
    539         lineSelectionData.add("\u4e03.\u3001,\u3002");
    540         lineSelectionData.add("\u516b");
    541 
    542         generalIteratorTest(lineBreak, lineSelectionData);
    543     }
    544 
    545     /**
    546      * @bug 4086052
    547      */
    548     @Test
    549     public void TestBug4086052() {
    550         List<String> lineSelectionData = new ArrayList<String>(1);
    551 
    552         lineSelectionData.add("foo\u00a0bar ");
    553 //        lineSelectionData.addElement("foo\ufeffbar");
    554 
    555         generalIteratorTest(lineBreak, lineSelectionData);
    556     }
    557 
    558     /**
    559      * @bug 4097920
    560      */
    561     @Test
    562     public void TestBug4097920() {
    563         List<String> lineSelectionData = new ArrayList<String>(3);
    564 
    565         lineSelectionData.add("dog,cat,mouse ");
    566         lineSelectionData.add("(one)");
    567         lineSelectionData.add("(two)\n");
    568         generalIteratorTest(lineBreak, lineSelectionData);
    569     }
    570 
    571 
    572 
    573     /**
    574      * @bug 4117554
    575      */
    576     @Test
    577     public void TestBug4117554Lines() {
    578         List<String> lineSelectionData = new ArrayList<String>(3);
    579 
    580         // Fullwidth .!? should be treated as postJwrd
    581         lineSelectionData.add("\u4e01\uff0e");
    582         lineSelectionData.add("\u4e02\uff01");
    583         lineSelectionData.add("\u4e03\uff1f");
    584 
    585         generalIteratorTest(lineBreak, lineSelectionData);
    586     }
    587 
    588     @Test
    589     public void TestLettersAndDigits() {
    590         // a character sequence such as "X11" or "30F3" or "native2ascii" should
    591         // be kept together as a single word
    592         List<String> lineSelectionData = new ArrayList<String>(3);
    593 
    594         lineSelectionData.add("X11 ");
    595         lineSelectionData.add("30F3 ");
    596         lineSelectionData.add("native2ascii");
    597 
    598         generalIteratorTest(lineBreak, lineSelectionData);
    599     }
    600 
    601 
    // Base letter + combining mark pairs used as single expected fragments in
    // TestCharacterBreak.
    private static final String graveS = "S\u0300";        // S + U+0300 combining grave accent
    private static final String acuteBelowI = "i\u0317";   // i + U+0317 combining acute accent below
    private static final String acuteE = "e\u0301";        // e + U+0301 combining acute accent
    private static final String circumflexA = "a\u0302";   // a + U+0302 combining circumflex accent
    private static final String tildeE = "e\u0303";        // e + U+0303 combining tilde
    607 
    608     @Test
    609     public void TestCharacterBreak() {
    610         List<String> characterSelectionData = new ArrayList<String>();
    611 
    612         characterSelectionData.add(graveS);
    613         characterSelectionData.add(acuteBelowI);
    614         characterSelectionData.add("m");
    615         characterSelectionData.add("p");
    616         characterSelectionData.add("l");
    617         characterSelectionData.add(acuteE);
    618         characterSelectionData.add(" ");
    619         characterSelectionData.add("s");
    620         characterSelectionData.add(circumflexA);
    621         characterSelectionData.add("m");
    622         characterSelectionData.add("p");
    623         characterSelectionData.add("l");
    624         characterSelectionData.add(tildeE);
    625         characterSelectionData.add(".");
    626         characterSelectionData.add("w");
    627         characterSelectionData.add(circumflexA);
    628         characterSelectionData.add("w");
    629         characterSelectionData.add("a");
    630         characterSelectionData.add("f");
    631         characterSelectionData.add("q");
    632         characterSelectionData.add("\n");
    633         characterSelectionData.add("\r");
    634         characterSelectionData.add("\r\n");
    635         characterSelectionData.add("\n");
    636 
    637         generalIteratorTest(characterBreak, characterSelectionData);
    638     }
    639 
    640     /**
    641      * @bug 4098467
    642      */
    643     @Test
    644     public void TestBug4098467Characters() {
    645         List<String> characterSelectionData = new ArrayList<String>();
    646 
    647         // What follows is a string of Korean characters (I found it in the Yellow Pages
    648         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    649         // it correctly), first as precomposed syllables, and then as conjoining jamo.
    650         // Both sequences should be semantically identical and break the same way.
    651         // precomposed syllables...
    652         characterSelectionData.add("\uc0c1");
    653         characterSelectionData.add("\ud56d");
    654         characterSelectionData.add(" ");
    655         characterSelectionData.add("\ud55c");
    656         characterSelectionData.add("\uc778");
    657         characterSelectionData.add(" ");
    658         characterSelectionData.add("\uc5f0");
    659         characterSelectionData.add("\ud569");
    660         characterSelectionData.add(" ");
    661         characterSelectionData.add("\uc7a5");
    662         characterSelectionData.add("\ub85c");
    663         characterSelectionData.add("\uad50");
    664         characterSelectionData.add("\ud68c");
    665         characterSelectionData.add(" ");
    666         // conjoining jamo...
    667         characterSelectionData.add("\u1109\u1161\u11bc");
    668         characterSelectionData.add("\u1112\u1161\u11bc");
    669         characterSelectionData.add(" ");
    670         characterSelectionData.add("\u1112\u1161\u11ab");
    671         characterSelectionData.add("\u110b\u1175\u11ab");
    672         characterSelectionData.add(" ");
    673         characterSelectionData.add("\u110b\u1167\u11ab");
    674         characterSelectionData.add("\u1112\u1161\u11b8");
    675         characterSelectionData.add(" ");
    676         characterSelectionData.add("\u110c\u1161\u11bc");
    677         characterSelectionData.add("\u1105\u1169");
    678         characterSelectionData.add("\u1100\u116d");
    679         characterSelectionData.add("\u1112\u116c");
    680 
    681         generalIteratorTest(characterBreak, characterSelectionData);
    682     }
    683 
    684     @Test
    685     public void TestTitleBreak()
    686     {
    687         List<String> titleData = new ArrayList<String>();
    688         titleData.add("   ");
    689         titleData.add("This ");
    690         titleData.add("is ");
    691         titleData.add("a ");
    692         titleData.add("simple ");
    693         titleData.add("sample ");
    694         titleData.add("sentence. ");
    695         titleData.add("This ");
    696 
    697         generalIteratorTest(titleBreak, titleData);
    698     }
    699 
    700 
    701 
    702     /*
    703      * @bug 4153072
    704      */
    705     @Test
    706     public void TestBug4153072() {
    707         BreakIterator iter = BreakIterator.getWordInstance();
    708         String str = "...Hello, World!...";
    709         int begin = 3;
    710         int end = str.length() - 3;
    711         // not used boolean gotException = false;
    712 
    713 
    714         iter.setText(new StringCharacterIterator(str, begin, end, begin));
    715         for (int index = -1; index < begin + 1; ++index) {
    716             try {
    717                 iter.isBoundary(index);
    718                 if (index < begin)
    719                     errln("Didn't get exception with offset = " + index +
    720                                     " and begin index = " + begin);
    721             }
    722             catch (IllegalArgumentException e) {
    723                 if (index >= begin)
    724                     errln("Got exception with offset = " + index +
    725                                     " and begin index = " + begin);
    726             }
    727         }
    728     }
    729 
    730 
    731     @Test
    732     public void TestBug4146175Lines() {
    733         List<String> lineSelectionData = new ArrayList<String>(2);
    734 
    735         // the fullwidth comma should stick to the preceding Japanese character
    736         lineSelectionData.add("\u7d42\uff0c");
    737         lineSelectionData.add("\u308f");
    738 
    739         generalIteratorTest(lineBreak, lineSelectionData);
    740     }
    741 
    742     private static final String cannedTestChars
    743         = "\u0000\u0001\u0002\u0003\u0004 !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
    744         + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
    745         + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
    746         + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
    747         + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
    748         + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
    749 
    750     @Test
    751     public void TestSentenceInvariants()
    752     {
    753         BreakIterator e = BreakIterator.getSentenceInstance();
    754         doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
    755     }
    756 
    757     @Test
    758     public void TestEmptyString()
    759     {
    760         String text = "";
    761         List<String> x = new ArrayList<String>(1);
    762         x.add(text);
    763 
    764         generalIteratorTest(lineBreak, x);
    765     }
    766 
    767     @Test
    768     public void TestGetAvailableLocales()
    769     {
    770         Locale[] locList = BreakIterator.getAvailableLocales();
    771 
    772         if (locList.length == 0)
    773             errln("getAvailableLocales() returned an empty list!");
    774         // I have no idea how to test this function...
    775 
    776         com.ibm.icu.util.ULocale[] ulocList = BreakIterator.getAvailableULocales();
    777         if (ulocList.length == 0) {
    778             errln("getAvailableULocales() returned an empty list!");
    779         } else {
    780             logln("getAvailableULocales() returned " + ulocList.length + " locales");
    781         }
    782     }
    783 
    784 
    785     /**
    786      * @bug 4068137
    787      */
    788     @Test
    789     public void TestEndBehavior()
    790     {
    791         String testString = "boo.";
    792         BreakIterator wb = BreakIterator.getWordInstance();
    793         wb.setText(testString);
    794 
    795         if (wb.first() != 0)
    796             errln("Didn't get break at beginning of string.");
    797         if (wb.next() != 3)
    798             errln("Didn't get break before period in \"boo.\"");
    799         if (wb.current() != 4 && wb.next() != 4)
    800             errln("Didn't get break at end of string.");
    801     }
    802 
    803     // The Following two tests are ported from ICU4C 1.8.1 [Richard/GCL]
    804     /**
    805      * Port From:   ICU4C v1.8.1 : textbounds : IntlTestTextBoundary
    806      * Source File: $ICU4CRoot/source/test/intltest/ittxtbd.cpp
    807      **/
    808     /**
    809      * test methods preceding, following and isBoundary
    810      **/
    811     @Test
    812     public void TestPreceding() {
    813         String words3 = "aaa bbb ccc";
    814         BreakIterator e = BreakIterator.getWordInstance(Locale.getDefault());
    815         e.setText( words3 );
    816         e.first();
    817         int p1 = e.next();
    818         int p2 = e.next();
    819         int p3 = e.next();
    820         int p4 = e.next();
    821 
    822         int f = e.following(p2+1);
    823         int p = e.preceding(p2+1);
    824         if (f!=p3)
    825             errln("IntlTestTextBoundary::TestPreceding: f!=p3");
    826         if (p!=p2)
    827             errln("IntlTestTextBoundary::TestPreceding: p!=p2");
    828 
    829         if (p1+1!=p2)
    830             errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
    831 
    832         if (p3+1!=p4)
    833             errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
    834 
    835         if (!e.isBoundary(p2) || e.isBoundary(p2+1) || !e.isBoundary(p3))
    836         {
    837             errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
    838         }
    839     }
    840 
    841 
    842     /**
    843      * Bug 4450804
    844      */
    845     @Test
    846     public void TestLineBreakContractions() {
    847         List<String> expected = new ArrayList<String>(7);
    848         expected.add("These ");
    849         expected.add("are ");
    850         expected.add("'foobles'. ");
    851         expected.add("Don't ");
    852         expected.add("you ");
    853         expected.add("like ");
    854         expected.add("them?");
    855         generalIteratorTest(lineBreak, expected);
    856     }
    857 
    858     /**
    859      * Ticket#5615
    860      */
    861     @Test
    862     public void TestT5615() {
    863         com.ibm.icu.util.ULocale[] ulocales = BreakIterator.getAvailableULocales();
    864         int type = 0;
    865         com.ibm.icu.util.ULocale loc = null;
    866         try {
    867             for (int i = 0; i < ulocales.length; i++) {
    868                 loc = ulocales[i];
    869                 for (type = 0; type < 5 /* 5 = BreakIterator.KIND_COUNT */; ++type) {
    870                     BreakIterator brk = BreakIterator.getBreakInstance(loc, type);
    871                     if (brk == null) {
    872                         errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc);
    873                     }
    874                 }
    875             }
    876         } catch (Exception e) {
    877             errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
    878         }
    879     }
    880 
    881     /**
    882      * At present, Japanese doesn't have exceptions.
    883      * However, this still should not fail.
    884      */
    885     @Test
    886     public void TestFilteredJapanese() {
    887         ULocale loc = ULocale.JAPANESE;
    888         BreakIterator brk = FilteredBreakIteratorBuilder
    889                 .createInstance(loc)
    890                 .build(BreakIterator.getSentenceInstance(loc));
    891         brk.setText("");
    892         assertEquals("Starting point", 0, brk.current());
    893         assertEquals("Next point", 5, brk.next());
    894         assertEquals("Last point", BreakIterator.DONE, brk.next());
    895     }
    896 
    897     /*
    898      * Test case for Ticket#10721. BreakIterator factory method should throw NPE
    899      * when specified locale is null.
    900      */
    901     @Test
    902     public void TestNullLocale() {
    903         Locale loc = null;
    904         ULocale uloc = null;
    905 
    906         @SuppressWarnings("unused")
    907         BreakIterator brk;
    908 
    909         // Character
    910         try {
    911             brk = BreakIterator.getCharacterInstance(loc);
    912             errln("getCharacterInstance((Locale)null) did not throw NPE.");
    913         } catch (NullPointerException e) { /* OK */ }
    914         try {
    915             brk = BreakIterator.getCharacterInstance(uloc);
    916             errln("getCharacterInstance((ULocale)null) did not throw NPE.");
    917         } catch (NullPointerException e) { /* OK */ }
    918 
    919         // Line
    920         try {
    921             brk = BreakIterator.getLineInstance(loc);
    922             errln("getLineInstance((Locale)null) did not throw NPE.");
    923         } catch (NullPointerException e) { /* OK */ }
    924         try {
    925             brk = BreakIterator.getLineInstance(uloc);
    926             errln("getLineInstance((ULocale)null) did not throw NPE.");
    927         } catch (NullPointerException e) { /* OK */ }
    928 
    929         // Sentence
    930         try {
    931             brk = BreakIterator.getSentenceInstance(loc);
    932             errln("getSentenceInstance((Locale)null) did not throw NPE.");
    933         } catch (NullPointerException e) { /* OK */ }
    934         try {
    935             brk = BreakIterator.getSentenceInstance(uloc);
    936             errln("getSentenceInstance((ULocale)null) did not throw NPE.");
    937         } catch (NullPointerException e) { /* OK */ }
    938 
    939         // Title
    940         try {
    941             brk = BreakIterator.getTitleInstance(loc);
    942             errln("getTitleInstance((Locale)null) did not throw NPE.");
    943         } catch (NullPointerException e) { /* OK */ }
    944         try {
    945             brk = BreakIterator.getTitleInstance(uloc);
    946             errln("getTitleInstance((ULocale)null) did not throw NPE.");
    947         } catch (NullPointerException e) { /* OK */ }
    948 
    949         // Word
    950         try {
    951             brk = BreakIterator.getWordInstance(loc);
    952             errln("getWordInstance((Locale)null) did not throw NPE.");
    953         } catch (NullPointerException e) { /* OK */ }
    954         try {
    955             brk = BreakIterator.getWordInstance(uloc);
    956             errln("getWordInstance((ULocale)null) did not throw NPE.");
    957         } catch (NullPointerException e) { /* OK */ }
    958     }
    959 
    960     /**
    961      * Test FilteredBreakIteratorBuilder newly introduced
    962      */
    963     @Test
    964     public void TestFilteredBreakIteratorBuilder() {
    965         FilteredBreakIteratorBuilder builder;
    966         BreakIterator baseBI;
    967         BreakIterator filteredBI;
    968 
    969         String text = "In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."; // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
    970         String ABBR_MR = "Mr.";
    971         String ABBR_CAPT = "Capt.";
    972 
    973         {
    974             logln("Constructing empty builder\n");
    975             builder = FilteredBreakIteratorBuilder.createInstance();
    976 
    977             logln("Constructing base BI\n");
    978             baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
    979 
    980             logln("Building new BI\n");
    981             filteredBI = builder.build(baseBI);
    982 
    983             assertDefaultBreakBehavior(filteredBI, text);
    984         }
    985 
    986         {
    987             logln("Constructing empty builder\n");
    988             builder = FilteredBreakIteratorBuilder.createInstance();
    989 
    990             logln("Adding Mr. as an exception\n");
    991 
    992             assertEquals("2.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
    993             assertEquals("2.2 suppressBreakAfter", false, builder.suppressBreakAfter(ABBR_MR));
    994             assertEquals("2.3 unsuppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_MR));
    995             assertEquals("2.4 unsuppressBreakAfter", false, builder.unsuppressBreakAfter(ABBR_MR));
    996             assertEquals("2.5 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
    997 
    998             logln("Constructing base BI\n");
    999             baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
   1000 
   1001             logln("Building new BI\n");
   1002             filteredBI = builder.build(baseBI);
   1003 
   1004             logln("Testing:");
   1005             filteredBI.setText(text);
   1006             assertEquals("2nd next", 84, filteredBI.next());
   1007             assertEquals("2nd next", 90, filteredBI.next());
   1008             assertEquals("2nd next", 278, filteredBI.next());
   1009             filteredBI.first();
   1010         }
   1011 
   1012 
   1013         {
   1014           logln("Constructing empty builder\n");
   1015           builder = FilteredBreakIteratorBuilder.createInstance();
   1016 
   1017           logln("Adding Mr. and Capt as an exception\n");
   1018           assertEquals("3.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
   1019           assertEquals("3.2 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_CAPT));
   1020 
   1021           logln("Constructing base BI\n");
   1022           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
   1023 
   1024           logln("Building new BI\n");
   1025           filteredBI = builder.build(baseBI);
   1026 
   1027           logln("Testing:");
   1028           filteredBI.setText(text);
   1029           assertEquals("3rd next", 84, filteredBI.next());
   1030           assertEquals("3rd next", 278, filteredBI.next());
   1031           filteredBI.first();
   1032         }
   1033 
   1034         {
   1035           logln("Constructing English builder\n");
   1036           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH);
   1037 
   1038           logln("Constructing base BI\n");
   1039           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
   1040 
   1041           logln("unsuppressing 'Capt'");
   1042           assertEquals("1st suppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_CAPT));
   1043 
   1044           logln("Building new BI\n");
   1045           filteredBI = builder.build(baseBI);
   1046 
   1047           if(filteredBI != null) {
   1048             logln("Testing:");
   1049             filteredBI.setText(text);
   1050             assertEquals("4th next", 84, filteredBI.next());
   1051             assertEquals("4th next", 90, filteredBI.next());
   1052             assertEquals("4th next", 278, filteredBI.next());
   1053             filteredBI.first();
   1054           }
   1055         }
   1056 
   1057         {
   1058           logln("Constructing English builder\n");
   1059           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH);
   1060 
   1061           logln("Constructing base BI\n");
   1062           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
   1063 
   1064           logln("Building new BI\n");
   1065           filteredBI = builder.build(baseBI);
   1066 
   1067           if(filteredBI != null) {
   1068             assertEnglishBreakBehavior(filteredBI, text);
   1069           }
   1070         }
   1071 
   1072         {
   1073             logln("Constructing English @ss=standard\n");
   1074             filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("en-US-u-ss-standard"));
   1075 
   1076             if(filteredBI != null) {
   1077               assertEnglishBreakBehavior(filteredBI, text);
   1078             }
   1079         }
   1080 
   1081         {
   1082             logln("Constructing Afrikaans @ss=standard - should be == default\n");
   1083             filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("af-u-ss-standard"));
   1084 
   1085             assertDefaultBreakBehavior(filteredBI, text);
   1086         }
   1087 
   1088         {
   1089             logln("Constructing Japanese @ss=standard - should be == default\n");
   1090             filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("ja-u-ss-standard"));
   1091 
   1092             assertDefaultBreakBehavior(filteredBI, text);
   1093         }
   1094         {
   1095             logln("Constructing tfg @ss=standard - should be == default\n");
   1096             filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("tfg-u-ss-standard"));
   1097 
   1098             assertDefaultBreakBehavior(filteredBI, text);
   1099         }
   1100 
   1101         {
   1102           logln("Constructing French builder");
   1103           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH);
   1104 
   1105           logln("Constructing base BI\n");
   1106           baseBI = BreakIterator.getSentenceInstance(Locale.FRENCH);
   1107 
   1108           logln("Building new BI\n");
   1109           filteredBI = builder.build(baseBI);
   1110 
   1111           if(filteredBI != null) {
   1112             assertFrenchBreakBehavior(filteredBI, text);
   1113           }
   1114         }
   1115     }
   1116 
   1117     /**
   1118      * @param filteredBI
   1119      * @param text
   1120      */
   1121     private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text) {
   1122         logln("Testing French behavior:");
   1123         filteredBI.setText(text);
   1124         assertEquals("6th next", 20, filteredBI.next());
   1125         assertEquals("6th next", 84, filteredBI.next());
   1126         filteredBI.first();
   1127     }
   1128 
   1129     /**
   1130      * @param filteredBI
   1131      * @param text
   1132      */
   1133     private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text) {
   1134         logln("Testing English filtered behavior:");
   1135           filteredBI.setText(text);
   1136 
   1137           assertEquals("5th next", 84, filteredBI.next());
   1138           assertEquals("5th next", 278, filteredBI.next());
   1139           filteredBI.first();
   1140     }
   1141 
   1142     /**
   1143      * @param filteredBI
   1144      * @param text
   1145      */
   1146     private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text) {
   1147         logln("Testing Default Behavior:");
   1148         filteredBI.setText(text);
   1149         assertEquals("1st next", 20, filteredBI.next());
   1150         assertEquals("1st next", 84, filteredBI.next());
   1151         assertEquals("1st next", 90, filteredBI.next());
   1152         assertEquals("1st next", 181, filteredBI.next());
   1153         assertEquals("1st next", 278, filteredBI.next());
   1154         filteredBI.first();
   1155     }
   1156 }
   1157