Home | History | Annotate | Download | only in rbbi
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2003-2016 International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.test.rbbi;
     10 
     11 
     12 // Monkey testing of RuleBasedBreakIterator
     13 import java.util.ArrayList;
     14 import java.util.Arrays;
     15 import java.util.List;
     16 import java.util.Locale;
     17 
     18 import org.junit.Test;
     19 
     20 import com.ibm.icu.dev.test.TestFmwk;
     21 import com.ibm.icu.lang.UCharacter;
     22 import com.ibm.icu.lang.UProperty;
     23 import com.ibm.icu.text.BreakIterator;
     24 import com.ibm.icu.text.RuleBasedBreakIterator;
     25 import com.ibm.icu.text.UTF16;
     26 import com.ibm.icu.text.UnicodeSet;
     27 
     28 
     29 /**
     30  * Monkey tests for RBBI.  These tests have independent implementations of
     31  * the Unicode TR boundary rules, and compare results between these and ICU's
     32  * implementation, using random data.
     33  *
     34  * Tests cover Grapheme Cluster (char), Word and Line breaks
     35  *
     36  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
     37  *
     38  */
     39 public class RBBITestMonkey extends TestFmwk {
     40     //
     41     //     class RBBIMonkeyKind
     42     //
     43     //        Monkey Test for Break Iteration
     44     //        Abstract interface class.   Concrete derived classes independently
     45     //        implement the break rules for different iterator types.
     46     //
     47     //        The Monkey Test itself uses doesn't know which type of break iterator it is
     48     //        testing, but works purely in terms of the interface defined here.
     49     //
     50     abstract static class RBBIMonkeyKind {
     51 
     52         // Return a List of UnicodeSets, representing the character classes used
     53         //   for this type of iterator.
     54         abstract  List  charClasses();
     55 
     56         // Set the test text on which subsequent calls to next() will operate
     57         abstract  void   setText(StringBuffer text);
     58 
     59         // Find the next break position, starting from the specified position.
     60         // Return -1 after reaching end of string.
     61         abstract   int   next(int i);
     62 
     63         // A Character Property, one of the constants defined in class UProperty.
     64         //   The value of this property will be displayed for the characters
     65         //    near any test failure.
     66         int   fCharProperty;
     67     }
     68 
     69     //
     70     // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
     71     //
     72     static String gExtended_Pict = "[" +
     73             "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" +
     74             "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
     75             "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" +
     76             "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" +
     77             "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" +
     78             "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" +
     79             "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" +
     80             "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" +
     81             "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
     82             "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" +
     83             "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" +
     84             "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" +
     85             "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" +
     86             "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" +
     87             "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" +
     88             "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" +
     89             "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" +
     90             "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" +
     91             "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" +
     92             "]";
     93 
     94 
     95     /**
     96      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
     97      * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
     98      */
     99     static class RBBICharMonkey extends RBBIMonkeyKind {
    100         List                      fSets;
    101 
    102         UnicodeSet                fCRLFSet;
    103         UnicodeSet                fControlSet;
    104         UnicodeSet                fExtendSet;
    105         UnicodeSet                fRegionalIndicatorSet;
    106         UnicodeSet                fPrependSet;
    107         UnicodeSet                fSpacingSet;
    108         UnicodeSet                fLSet;
    109         UnicodeSet                fVSet;
    110         UnicodeSet                fTSet;
    111         UnicodeSet                fLVSet;
    112         UnicodeSet                fLVTSet;
    113         UnicodeSet                fHangulSet;
    114         UnicodeSet                fEmojiModifierSet;
    115         UnicodeSet                fEmojiBaseSet;
    116         UnicodeSet                fZWJSet;
    117         UnicodeSet                fExtendedPictSet;
    118         UnicodeSet                fEBGSet;
    119         UnicodeSet                fEmojiNRKSet;
    120         UnicodeSet                fAnySet;
    121 
    122 
    123         StringBuffer              fText;
    124 
    125 
    126         RBBICharMonkey() {
    127             fText       = null;
    128             fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
    129             fCRLFSet    = new UnicodeSet("[\\r\\n]");
    130             fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
    131             fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
    132             fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
    133             fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
    134             fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
    135             fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
    136             fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
    137             fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
    138             fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
    139             fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
    140             fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
    141             fHangulSet  = new UnicodeSet();
    142             fHangulSet.addAll(fLSet);
    143             fHangulSet.addAll(fVSet);
    144             fHangulSet.addAll(fTSet);
    145             fHangulSet.addAll(fLVSet);
    146             fHangulSet.addAll(fLVTSet);
    147 
    148             fEmojiBaseSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
    149             fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]");
    150             fExtendedPictSet  = new UnicodeSet(gExtended_Pict);
    151             fEBGSet           = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]");
    152             fEmojiNRKSet      = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]");
    153             fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
    154 
    155 
    156             fSets       = new ArrayList();
    157             fSets.add(fCRLFSet);
    158             fSets.add(fControlSet);
    159             fSets.add(fExtendSet);
    160             fSets.add(fRegionalIndicatorSet);
    161             if (!fPrependSet.isEmpty()) {
    162                 fSets.add(fPrependSet);
    163             }
    164             fSets.add(fSpacingSet);
    165             fSets.add(fHangulSet);
    166             fSets.add(fAnySet);
    167             fSets.add(fEmojiBaseSet);
    168             fSets.add(fEmojiModifierSet);
    169             fSets.add(fZWJSet);
    170             fSets.add(fExtendedPictSet);
    171             fSets.add(fEBGSet);
    172             fSets.add(fEmojiNRKSet);
    173         }
    174 
    175 
    176         @Override
    177         void setText(StringBuffer s) {
    178             fText = s;
    179         }
    180 
    181         @Override
    182         List charClasses() {
    183             return fSets;
    184         }
    185 
    186         @Override
    187         int next(int prevPos) {
    188             int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
    189             //   break position being tested.  The candidate break
    190             //   location is before p2.
    191 
    192             int     breakPos = -1;
    193 
    194             int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
    195             int   cBase;              // for (X Extend*) patterns, the X character.
    196 
    197             // Previous break at end of string.  return DONE.
    198             if (prevPos >= fText.length()) {
    199                 return -1;
    200             }
    201             /* p0 = */ p1 = p2 = p3 = prevPos;
    202             c3 =  UTF16.charAt(fText, prevPos);
    203             c0 = c1 = c2 = cBase = 0;
    204 
    205             // Loop runs once per "significant" character position in the input text.
    206             for (;;) {
    207                 // Move all of the positions forward in the input string.
    208                 /* p0 = p1;*/  c0 = c1;
    209                 p1 = p2;  c1 = c2;
    210                 p2 = p3;  c2 = c3;
    211 
    212                 // Advance p3 by one codepoint
    213                 p3 = moveIndex32(fText, p3, 1);
    214                 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
    215 
    216                 if (p1 == p2) {
    217                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
    218                     continue;
    219                 }
    220                 if (p2 == fText.length()) {
    221                     // Reached end of string.  Always a break position.
    222                     break;
    223                 }
    224 
    225                 // Rule  GB3   CR x LF
    226                 //     No Extend or Format characters may appear between the CR and LF,
    227                 //     which requires the additional check for p2 immediately following p1.
    228                 //
    229                 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
    230                     continue;
    231                 }
    232 
    233                 // Rule (GB4).   ( Control | CR | LF ) <break>
    234                 if (fControlSet.contains(c1) ||
    235                         c1 == 0x0D ||
    236                         c1 == 0x0A)  {
    237                     break;
    238                 }
    239 
    240                 // Rule (GB5)    <break>  ( Control | CR | LF )
    241                 //
    242                 if (fControlSet.contains(c2) ||
    243                         c2 == 0x0D ||
    244                         c2 == 0x0A)  {
    245                     break;
    246                 }
    247 
    248 
    249                 // Rule (GB6)  L x ( L | V | LV | LVT )
    250                 if (fLSet.contains(c1) &&
    251                         (fLSet.contains(c2)  ||
    252                                 fVSet.contains(c2)  ||
    253                                 fLVSet.contains(c2) ||
    254                                 fLVTSet.contains(c2))) {
    255                     continue;
    256                 }
    257 
    258                 // Rule (GB7)    ( LV | V )  x  ( V | T )
    259                 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
    260                         (fVSet.contains(c2) || fTSet.contains(c2)))  {
    261                     continue;
    262                 }
    263 
    264                 // Rule (GB8)    ( LVT | T)  x T
    265                 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
    266                         fTSet.contains(c2))  {
    267                     continue;
    268                 }
    269 
    270                 // Rule (GB9)    x (Extend | ZWJ)
    271                 if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
    272                     if (!fExtendSet.contains(c1)) {
    273                         cBase = c1;
    274                     }
    275                     continue;
    276                 }
    277 
    278                 // Rule (GB9a)   x  SpacingMark
    279                 if (fSpacingSet.contains(c2)) {
    280                     continue;
    281                 }
    282 
    283                 // Rule (GB9b)   Prepend x
    284                 if (fPrependSet.contains(c1)) {
    285                     continue;
    286                 }
    287                 // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
    288                 if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
    289                     continue;
    290                 }
    291                 if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
    292                         fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
    293                     continue;
    294                 }
    295 
    296                 // Rule (GB11)   (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji)
    297                 if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) &&
    298                         (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
    299                     continue;
    300                 }
    301 
    302                 // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
    303                 //                  Note: The first if condition is a little tricky. We only need to force
    304                 //                      a break if there are three or more contiguous RIs. If there are
    305                 //                      only two, a break following will occur via other rules, and will include
    306                 //                      any trailing extend characters, which is needed behavior.
    307                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
    308                         && fRegionalIndicatorSet.contains(c2)) {
    309                     break;
    310                 }
    311                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
    312                     continue;
    313                 }
    314 
    315                 // Rule (GB999)  Any  <break>  Any
    316                 break;
    317             }
    318 
    319             breakPos = p2;
    320             return breakPos;
    321         }
    322     }
    323 
    324 
    325     /**
    326      *
    327      * Word Monkey Test Class
    328      *
    329      *
    330      *
    331      */
    332     static class RBBIWordMonkey extends RBBIMonkeyKind {
    333         List                      fSets;
    334         StringBuffer              fText;
    335 
    336         UnicodeSet                fCRSet;
    337         UnicodeSet                fLFSet;
    338         UnicodeSet                fNewlineSet;
    339         UnicodeSet                fRegionalIndicatorSet;
    340         UnicodeSet                fKatakanaSet;
    341         UnicodeSet                fHebrew_LetterSet;
    342         UnicodeSet                fALetterSet;
    343         UnicodeSet                fSingle_QuoteSet;
    344         UnicodeSet                fDouble_QuoteSet;
    345         UnicodeSet                fMidNumLetSet;
    346         UnicodeSet                fMidLetterSet;
    347         UnicodeSet                fMidNumSet;
    348         UnicodeSet                fNumericSet;
    349         UnicodeSet                fFormatSet;
    350         UnicodeSet                fExtendSet;
    351         UnicodeSet                fExtendNumLetSet;
    352         UnicodeSet                fOtherSet;
    353         UnicodeSet                fDictionarySet;
    354         UnicodeSet                fEBaseSet;
    355         UnicodeSet                fEBGSet;
    356         UnicodeSet                fEModifierSet;
    357         UnicodeSet                fZWJSet;
    358         UnicodeSet                fExtendedPictSet;
    359         UnicodeSet                fEmojiNRKSet;
    360 
    361 
    362         RBBIWordMonkey() {
    363             fCharProperty    = UProperty.WORD_BREAK;
    364 
    365             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
    366             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
    367             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
    368             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
    369             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
    370             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
    371             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
    372             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
    373             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
    374             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
    375             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
    376             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
    377             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
    378             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
    379             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
    380             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
    381             fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
    382             fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
    383             fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
    384             fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
    385             fExtendedPictSet = new UnicodeSet(gExtended_Pict);
    386             fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]");
    387 
    388             fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
    389             fDictionarySet.addAll(fKatakanaSet);
    390             fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
    391 
    392             fALetterSet.removeAll(fDictionarySet);
    393 
    394             fOtherSet        = new UnicodeSet();
    395             fOtherSet.complement();
    396             fOtherSet.removeAll(fCRSet);
    397             fOtherSet.removeAll(fLFSet);
    398             fOtherSet.removeAll(fNewlineSet);
    399             fOtherSet.removeAll(fALetterSet);
    400             fOtherSet.removeAll(fSingle_QuoteSet);
    401             fOtherSet.removeAll(fDouble_QuoteSet);
    402             fOtherSet.removeAll(fKatakanaSet);
    403             fOtherSet.removeAll(fHebrew_LetterSet);
    404             fOtherSet.removeAll(fMidLetterSet);
    405             fOtherSet.removeAll(fMidNumSet);
    406             fOtherSet.removeAll(fNumericSet);
    407             fOtherSet.removeAll(fFormatSet);
    408             fOtherSet.removeAll(fExtendSet);
    409             fOtherSet.removeAll(fExtendNumLetSet);
    410             fOtherSet.removeAll(fRegionalIndicatorSet);
    411             fOtherSet.removeAll(fEBaseSet);
    412             fOtherSet.removeAll(fEBGSet);
    413             fOtherSet.removeAll(fEModifierSet);
    414             fOtherSet.removeAll(fZWJSet);
    415             fOtherSet.removeAll(fExtendedPictSet);
    416             fOtherSet.removeAll(fEmojiNRKSet);
    417 
    418             // Inhibit dictionary characters from being tested at all.
    419             // remove surrogates so as to not generate higher CJK characters
    420             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
    421             fOtherSet.removeAll(fDictionarySet);
    422 
    423             fSets            = new ArrayList();
    424             fSets.add(fCRSet);
    425             fSets.add(fLFSet);
    426             fSets.add(fNewlineSet);
    427             fSets.add(fRegionalIndicatorSet);
    428             fSets.add(fHebrew_LetterSet);
    429             fSets.add(fALetterSet);
    430             //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
    431             // from the test data. They are all in the dictionary set,
    432             // which this (old, to be retired) monkey test cannot handle.
    433             fSets.add(fSingle_QuoteSet);
    434             fSets.add(fDouble_QuoteSet);
    435             fSets.add(fMidLetterSet);
    436             fSets.add(fMidNumLetSet);
    437             fSets.add(fMidNumSet);
    438             fSets.add(fNumericSet);
    439             fSets.add(fFormatSet);
    440             fSets.add(fExtendSet);
    441             fSets.add(fExtendNumLetSet);
    442             fSets.add(fRegionalIndicatorSet);
    443             fSets.add(fEBaseSet);
    444             fSets.add(fEBGSet);
    445             fSets.add(fEModifierSet);
    446             fSets.add(fZWJSet);
    447             fSets.add(fExtendedPictSet);
    448             fSets.add(fEmojiNRKSet);
    449             fSets.add(fOtherSet);
    450         }
    451 
    452 
    453         @Override
    454         List  charClasses() {
    455             return fSets;
    456         }
    457 
    458         @Override
    459         void   setText(StringBuffer s) {
    460             fText = s;
    461         }
    462 
    463         @Override
    464         int   next(int prevPos) {
    465             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
    466             //   break position being tested.  The candidate break
    467             //   location is before p2.
    468             int     breakPos = -1;
    469 
    470             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    471 
    472             // Previous break at end of string.  return DONE.
    473             if (prevPos >= fText.length()) {
    474                 return -1;
    475             }
    476             /*p0 =*/ p1 = p2 = p3 = prevPos;
    477             c3 = UTF16.charAt(fText, prevPos);
    478             c0 = c1 = c2 = 0;
    479 
    480 
    481 
    482             // Loop runs once per "significant" character position in the input text.
    483             for (;;) {
    484                 // Move all of the positions forward in the input string.
    485                 /*p0 = p1;*/  c0 = c1;
    486                 p1 = p2;  c1 = c2;
    487                 p2 = p3;  c2 = c3;
    488 
    489                 // Advance p3 by    X(Extend | Format)*   Rule 4
    490                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
    491                 do {
    492                     p3 = moveIndex32(fText, p3, 1);
    493                     c3 = -1;
    494                     if (p3>=fText.length()) {
    495                         break;
    496                     }
    497                     c3 = UTF16.charAt(fText, p3);
    498                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
    499                         break;
    500                     }
    501                 }
    502                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
    503 
    504                 if (p1 == p2) {
    505                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
    506                     continue;
    507                 }
    508                 if (p2 == fText.length()) {
    509                     // Reached end of string.  Always a break position.
    510                     break;
    511                 }
    512 
    513                 // Rule (3)   CR x LF
    514                 //     No Extend or Format characters may appear between the CR and LF,
    515                 //     which requires the additional check for p2 immediately following p1.
    516                 //
    517                 if (c1==0x0D && c2==0x0A) {
    518                     continue;
    519                 }
    520 
    521                 // Rule (3a)  Break before and after newlines (including CR and LF)
    522                 //
    523                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
    524                     break;
    525                 }
    526                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
    527                     break;
    528                 }
    529 
    530                 // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
    531                 //              Not ignoring extend chars, so peek into input text to
    532                 //              get the potential ZWJ, the character immediately preceding c2.
    533                 if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
    534                     continue;
    535                 }
    536 
    537                 // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
    538                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
    539                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
    540                     continue;
    541                 }
    542 
    543                 // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
    544                 //
    545                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
    546                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
    547                         (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
    548                     continue;
    549                 }
    550 
    551                 // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
    552                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
    553                         (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
    554                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
    555                     continue;
    556                 }
    557 
    558                 // Rule (7a)     Hebrew_Letter x Single_Quote
    559                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
    560                     continue;
    561                 }
    562 
    563                 // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
    564                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
    565                     continue;
    566                 }
    567 
    568                 // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
    569                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
    570                     continue;
    571                 }
    572 
    573                 //  Rule (8)    Numeric x Numeric
    574                 if (fNumericSet.contains(c1) &&
    575                         fNumericSet.contains(c2))  {
    576                     continue;
    577                 }
    578 
    579                 // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
    580                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
    581                         fNumericSet.contains(c2))  {
    582                     continue;
    583                 }
    584 
    585                 // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
    586                 if (fNumericSet.contains(c1) &&
    587                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
    588                     continue;
    589                 }
    590 
    591                 // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
    592                 if (fNumericSet.contains(c0) &&
    593                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
    594                         fNumericSet.contains(c2)) {
    595                     continue;
    596                 }
    597 
    598                 // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
    599                 if (fNumericSet.contains(c1) &&
    600                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
    601                         setContains(fNumericSet, c3)) {
    602                     continue;
    603                 }
    604 
    605                 // Rule (13)  Katakana x Katakana
    606                 //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
    607                 //                  all Katakana are handled by the dictionary breaker.
    608                 if (fKatakanaSet.contains(c1) &&
    609                         fKatakanaSet.contains(c2))  {
    610                     continue;
    611                 }
    612 
    613                 // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
    614                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
    615                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
    616                         fExtendNumLetSet.contains(c2)) {
    617                     continue;
    618                 }
    619 
    620                 // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
    621                 if (fExtendNumLetSet.contains(c1) &&
    622                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
    623                                 fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
    624                     continue;
    625                 }
    626 
    627 
    628                 // Rule 14 (E_Base | EBG) x E_Modifier
    629                 if ((fEBaseSet.contains(c1)  || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
    630                     continue;
    631                 }
    632 
    633                 // Rule 15 - 17   Group pairs of Regional Indicators
    634                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
    635                     break;
    636                 }
    637                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
    638                     continue;
    639                 }
    640 
    641                 // Rule 999.  Break found here.
    642                 break;
    643             }
    644 
    645             breakPos = p2;
    646             return breakPos;
    647         }
    648 
    649     }
    650 
    651 
    652     static class RBBILineMonkey extends RBBIMonkeyKind {
    653 
    654         List        fSets;   // Raw List; the constructor fills it with the UnicodeSets below.
    655 
    656         // UnicodeSets for each of the Line Breaking character classes.
    657         // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
    658         // to verify that they are all accounted for.
    659 
    660         UnicodeSet  fBK;
    661         UnicodeSet  fCR;
    662         UnicodeSet  fLF;
    663         UnicodeSet  fCM;
    664         UnicodeSet  fNL;
    665         UnicodeSet  fSG;
    666         UnicodeSet  fWJ;
    667         UnicodeSet  fZW;
    668         UnicodeSet  fGL;
    669         UnicodeSet  fSP;
    670         UnicodeSet  fB2;
    671         UnicodeSet  fBA;
    672         UnicodeSet  fBB;
    673         UnicodeSet  fHY;
    674         UnicodeSet  fCB;
    675         UnicodeSet  fCL;
    676         UnicodeSet  fCP;
    677         UnicodeSet  fEX;
    678         UnicodeSet  fIN;
    679         UnicodeSet  fNS;
    680         UnicodeSet  fOP;
    681         UnicodeSet  fQU;
    682         UnicodeSet  fIS;
    683         UnicodeSet  fNU;
    684         UnicodeSet  fPO;
    685         UnicodeSet  fPR;
    686         UnicodeSet  fSY;
    687         UnicodeSet  fAI;
    688         UnicodeSet  fAL;
    689         UnicodeSet  fCJ;
    690         UnicodeSet  fH2;
    691         UnicodeSet  fH3;
    692         UnicodeSet  fHL;
    693         UnicodeSet  fID;
    694         UnicodeSet  fJL;
    695         UnicodeSet  fJV;
    696         UnicodeSet  fJT;
    697         UnicodeSet  fRI;
    698         UnicodeSet  fXX;
    699         UnicodeSet  fEB;
    700         UnicodeSet  fEM;
    701         UnicodeSet  fZWJ;            // Consulted directly by the LB 8a check in next().
    702         UnicodeSet  fExtendedPict;   // Copy of gExtended_Pict; used by the LB 8a check in next().
    703         UnicodeSet  fEmojiNRK;       // Emoji minus RI, '*', '#' and 0-9; used by the LB 8a check in next().
    704 
    705         StringBuffer  fText;            // Text being scanned; assigned by setText().
    706         int           fOrigPositions;   // NOTE(review): not referenced in the visible part of this class - confirm it is still needed.
    707 
    708 
    709 
    710         RBBILineMonkey()
    711         {
                    // Builds one UnicodeSet per line break class, folds a few classes into others
                    // (XX, AI, SG -> AL;  CJ -> NS;  ZWJ -> CM), and then registers the sets in fSets.
                    // fCharProperty is not declared in this class; presumably it is inherited
                    // from RBBIMonkeyKind - confirm.
    712             fCharProperty  = UProperty.LINE_BREAK;
    713             fSets          = new ArrayList();
    714 
    715             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
    716             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
    717             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
    718             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
    719             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
                    // SG is built from the surrogate code point range rather than from a property query.
    720             fSG    = new UnicodeSet("[\\ud800-\\udfff]");
    721             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
    722             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
    723             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
    724             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
    725             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
    726             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
    727             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
    728             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
    729             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
    730             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
    731             fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
    732             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
    733             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
    734             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
    735             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
    736             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
    737             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
    738             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
    739             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
    740             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
    741             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
    742             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
    743             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
    744             fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
    745             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
    746             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
    747             fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
    748             fID    = new UnicodeSet("[\\p{Line_break=ID}]");
    749             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
    750             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
    751             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
    752             fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
    753             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
                    // EB is the property class plus a hard-coded list of additional code points.
    754             fEB    = new UnicodeSet("[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
    755             fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
    756             fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
                    // All Emoji except Regional Indicators, '*', '#' and the ASCII digits 0-9.
    757             fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9]]");
                    // gExtended_Pict is defined outside this class; a private copy is taken here.
    758             fExtendedPict = new UnicodeSet(gExtended_Pict);
    759 
    760 
    761             // Remove dictionary characters.
    762             // The monkey test reference implementation of line break does not replicate the dictionary behavior,
    763             // so dictionary characters are omitted from the monkey test data.
                    // NOTE(review): dictionarySet is only constructed here; nothing in this constructor
                    //               removes it from any of the sets, hence the "unused" suppression.
                    //               Confirm whether the removal is intentionally disabled.
    764             @SuppressWarnings("unused")
    765             UnicodeSet dictionarySet = new UnicodeSet(
    766                     "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
    767 
    768             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
    769             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
    770             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
    771 
    772             fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
    773             fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
    774 
                    // Register the sets.  fCJ and fXX are not added on their own; their contents
                    // were merged into fNS and fAL above.  fAI, fSG and fZWJ are added both merged
                    // (inside fAL / fCM) and as separate entries.
    775             fSets.add(fBK);
    776             fSets.add(fCR);
    777             fSets.add(fLF);
    778             fSets.add(fCM);
    779             fSets.add(fNL);
    780             fSets.add(fWJ);
    781             fSets.add(fZW);
    782             fSets.add(fGL);
    783             fSets.add(fSP);
    784             fSets.add(fB2);
    785             fSets.add(fBA);
    786             fSets.add(fBB);
    787             fSets.add(fHY);
    788             fSets.add(fCB);
    789             fSets.add(fCL);
    790             fSets.add(fCP);
    791             fSets.add(fEX);
    792             fSets.add(fIN);
    793             fSets.add(fJL);
    794             fSets.add(fJT);
    795             fSets.add(fJV);
    796             fSets.add(fNS);
    797             fSets.add(fOP);
    798             fSets.add(fQU);
    799             fSets.add(fIS);
    800             fSets.add(fNU);
    801             fSets.add(fPO);
    802             fSets.add(fPR);
    803             fSets.add(fSY);
    804             fSets.add(fAI);
    805             fSets.add(fAL);
    806             fSets.add(fH2);
    807             fSets.add(fH3);
    808             fSets.add(fHL);
    809             fSets.add(fID);
                    // NOTE(review): fWJ was already added above, so it appears in fSets twice.
                    //               Confirm whether the duplicate entry is intentional.
    810             fSets.add(fWJ);
    811             fSets.add(fRI);
    812             fSets.add(fSG);
    813             fSets.add(fEB);
    814             fSets.add(fEM);
    815             fSets.add(fZWJ);
    816             fSets.add(fExtendedPict);
    817             fSets.add(fEmojiNRK);
    818         }
    819 
    820         @Override
    821         void setText(StringBuffer s) {
                    // Installs the text that next() will scan.  Only the reference is stored;
                    // no copy of the buffer is made.
    822             fText       = s;
    823         }
    824 
    825 
    826 
    827 
    828         @Override
    829         int next(int startPos) {
                    // Reference implementation of the line break rules, evaluated in the order
                    // in which they appear below.  Returns -1 when startPos is at or past the end
                    // of fText; otherwise returns the value of "pos" at which the rule loop
                    // stopped, i.e. the index of the character that follows the break found.
                    // The first loop pass only primes prevPos / prevChar, so no break is ever
                    // tested in front of the character at startPos itself.
    830             int    pos;       //  Index of the char following a potential break position
    831             int    thisChar;  //  Character at above position "pos"
    832 
    833             int    prevPos;   //  Index of the char preceding a potential break position
    834             int    prevChar;  //  Character at above position.  Note that prevChar
    835             //   and thisChar may not be adjacent because combining
    836             //   characters between them will be ignored.
    837             int    prevCharX2; //  Character before prevChar, more context for LB 21a and LB 30a
    838 
    839             int    nextPos;   //  Index of the next character following pos.
    840             //     Usually skips over combining marks.
    841             int    tPos;      //  temp value.
    842             int    matchVals[]  = null;       // Number  Expression Match Results
    843 
    844 
    845             if (startPos >= fText.length()) {
    846                 return -1;
    847             }
    848 
    849 
    850             // Initial values for loop.  Loop will run the first time without finding breaks,
    851             //                           while the invalid values shift out and the "this" and
    852             //                           "prev" positions are filled in with good values.
    853             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    854             thisChar = prevChar  = prevCharX2 = 0;
    855             nextPos  = startPos;
    856 
    857 
    858             // Loop runs once per position in the test text, until a break position
    859             //  is found.  In each iteration, we are testing for a possible break
    860             //  just preceding the character at index "pos".  The character preceding
    861             //  this char is at position "prevPos"; because of combining sequences,
    862             //  "prevPos" can be arbitrarily far before "pos".
    863             for (;;) {
    864                 // Advance to the next position to be tested.
    865                 prevCharX2 = prevChar;
    866                 prevPos   = pos;
    867                 prevChar  = thisChar;
    868                 pos       = nextPos;
    869                 nextPos   = moveIndex32(fText, pos, 1);
    870 
    871                 // Rule LB2 - Break at end of text.
    872                 if (pos >= fText.length()) {
    873                     break;
    874                 }
    875 
    876                 // Rule LB 9 - adjust for combining sequences.
    877                 //             We do this rule out-of-order because the adjustment does
    878                 //             not affect the way that rules LB 3 through LB 6 match,
    879                 //             and doing it here rather than after LB 6 is substantially
    880                 //             simpler when combining sequences do occur.
    881 
    882 
    883                 // LB 9         Keep combining sequences together.
    884                 //              advance over any CM class chars at "pos",
    885                 //              result is "nextPos" for the following loop iteration.
                        //              CR and LF are compared by value (0x0d / 0x0a) here rather
                        //              than through fCR / fLF.
    886                 thisChar  = UTF16.charAt(fText, pos);
    887                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
    888                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
    889                     for (;;) {
    890                         if (nextPos == fText.length()) {
    891                             break;
    892                         }
    893                         int nextChar = UTF16.charAt(fText, nextPos);
    894                         if (!fCM.contains(nextChar)) {
    895                             break;
    896                         }
    897                         nextPos = moveIndex32(fText, nextPos, 1);
    898                     }
    899                 }
    900 
    901                 // LB 9 Treat X CM* as if it were X
    902                 //        No explicit action required.
    903 
    904                 // LB 10     Treat any remaining combining mark as AL
    905                 if (fCM.contains(thisChar)) {
    906                     thisChar = 'A';
    907                 }
    908 
    909 
    910                 // If the loop is still warming up - if we haven't shifted the initial
    911                 //   -1 positions out of prevPos yet - loop back to advance the
    912                 //    position in the input without any further looking for breaks.
    913                 if (prevPos == -1) {
    914                     continue;
    915                 }
    916 
    917                 // LB 4  Always break after hard line breaks,
    918                 if (fBK.contains(prevChar)) {
    919                     break;
    920                 }
    921 
    922                 // LB 5  Break after CR, LF, NL, but not inside CR LF
    923                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
    924                     continue;
    925                 }
    926                 if  (fCR.contains(prevChar) ||
    927                         fLF.contains(prevChar) ||
    928                         fNL.contains(prevChar))  {
    929                     break;
    930                 }
    931 
    932                 // LB 6  Don't break before hard line breaks
    933                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
    934                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
    935                     continue;
    936                 }
    937 
    938 
    939                 // LB 7  Don't break before spaces or zero-width space.
    940                 if (fSP.contains(thisChar)) {
    941                     continue;
    942                 }
    943 
    944                 if (fZW.contains(thisChar)) {
    945                     continue;
    946                 }
    947 
    948                 // LB 8  Break after zero width space
    949                 if (fZW.contains(prevChar)) {
    950                     break;
    951                 }
    952 
    953                 // LB 8a:  ZWJ x (ID | Extended_Pictographic | Emoji)
    954                 //       The monkey test's way of ignoring combining characters doesn't work
    955                 //       for this rule. ZWJ is also a CM. Need to get the actual character
    956                 //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
    957                 {
    958                     int prevC = fText.codePointBefore(pos);
    959                     if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
    960                         continue;
    961                     }
    962                 }
    963 
    964                 //  LB 9, 10  Already done, at top of loop.
    965                 //
    966 
    967 
    968                 // LB 11
    969                 //    x  WJ
    970                 //    WJ  x
    971                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
    972                     continue;
    973                 }
    974 
    975 
    976                 // LB 12
    977                 //        GL x
    978                 if (fGL.contains(prevChar)) {
    979                     continue;
    980                 }
    981 
    982                 // LB 12a
    983                 //    [^SP BA HY] x GL
    984                 if (!(fSP.contains(prevChar) ||
    985                         fBA.contains(prevChar) ||
    986                         fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
    987                     continue;
    988                 }
    989 
    990 
    991 
    992                 // LB 13  Don't break before closings.
    993                 //       NU x CL, NU x CP  and NU x IS are not matched here so that they will
    994                 //       fall into LB 17 and the more general number regular expression.
    995                 //
                        // NOTE(review): in this method the number expression is evaluated under the
                        //       "LB 25" heading further down; the "LB 17" mentioned above looks like
                        //       stale rule numbering - confirm.
    996                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
    997                         !fNU.contains(prevChar) && fCP.contains(thisChar) ||
    998                         fEX.contains(thisChar) ||
    999                         !fNU.contains(prevChar) && fIS.contains(thisChar) ||
   1000                         !fNU.contains(prevChar) && fSY.contains(thisChar))    {
   1001                     continue;
   1002                 }
   1003 
   1004                 // LB 14  Don't break after OP SP*
   1005                 //       Scan backwards, checking for this sequence.
   1006                 //       The OP char could include combining marks, so we actually check for
   1007                 //           OP CM* SP* x
                        //       The backward SP scan only runs when prevChar is itself a space;
                        //       the CM scan that follows always runs.
   1008                 tPos = prevPos;
   1009                 if (fSP.contains(prevChar)) {
   1010                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1011                         tPos=moveIndex32(fText, tPos, -1);
   1012                     }
   1013                 }
   1014                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1015                     tPos=moveIndex32(fText, tPos, -1);
   1016                 }
   1017                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
   1018                     continue;
   1019                 }
   1020 
   1021                 // LB 15 Do not break within "[
   1022                 //       QU CM* SP* x OP
   1023                 if (fOP.contains(thisChar)) {
   1024                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   1025                     tPos = prevPos;
   1026                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1027                         tPos = moveIndex32(fText, tPos, -1);
   1028                     }
   1029                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1030                         tPos = moveIndex32(fText, tPos, -1);
   1031                     }
   1032                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
   1033                         continue;
   1034                     }
   1035                 }
   1036 
   1037                 // LB 16   (CL | CP) SP* x NS
   1038                 if (fNS.contains(thisChar)) {
   1039                     tPos = prevPos;
   1040                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1041                         tPos = moveIndex32(fText, tPos, -1);
   1042                     }
   1043                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1044                         tPos = moveIndex32(fText, tPos, -1);
   1045                     }
   1046                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
   1047                         continue;
   1048                     }
   1049                 }
   1050 
   1051 
   1052                 // LB 17        B2 SP* x B2
   1053                 if (fB2.contains(thisChar)) {
   1054                     tPos = prevPos;
   1055                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1056                         tPos = moveIndex32(fText, tPos, -1);
   1057                     }
   1058                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1059                         tPos = moveIndex32(fText, tPos, -1);
   1060                     }
   1061                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
   1062                         continue;
   1063                     }
   1064                 }
   1065 
   1066                 // LB 18    break after space
   1067                 if (fSP.contains(prevChar)) {
   1068                     break;
   1069                 }
   1070 
   1071                 // LB 19
   1072                 //    x   QU
   1073                 //    QU  x
   1074                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
   1075                     continue;
   1076                 }
   1077 
   1078                 // LB 20  Break around a CB
   1079                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
   1080                     break;
   1081                 }
   1082 
   1083                 // LB 21
   1084                 if (fBA.contains(thisChar) ||
   1085                         fHY.contains(thisChar) ||
   1086                         fNS.contains(thisChar) ||
   1087                         fBB.contains(prevChar) )   {
   1088                     continue;
   1089                 }
   1090 
   1091                 // LB 21a, HL (HY | BA) x
   1092                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
   1093                     continue;
   1094                 }
   1095 
   1096                 // LB 21b, SY x HL
   1097                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
   1098                     continue;
   1099                 }
   1100 
   1101                 // LB 22
   1102                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
   1103                         fEX.contains(prevChar) && fIN.contains(thisChar) ||
   1104                         fHL.contains(prevChar) && fIN.contains(thisChar) ||
   1105                         (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) ||
   1106                         fIN.contains(prevChar) && fIN.contains(thisChar) ||
   1107                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {
   1108                     continue;
   1109                 }
   1110 
   1111                 // LB 23    (AL | HL) x NU
   1112                 //          NU x (AL | HL)
   1113                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
   1114                     continue;
   1115                 }
   1116                 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1117                     continue;
   1118                 }
   1119 
   1120                 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
   1121                 //      PR x (ID | EB | EM)
   1122                 //     (ID | EB | EM) x PO
   1123                 if (fPR.contains(prevChar) &&
   1124                         (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
   1125                     continue;
   1126                 }
   1127                 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
   1128                         fPO.contains(thisChar)) {
   1129                     continue;
   1130                 }
   1131 
   1132                 // LB 24  Do not break between prefix and letters or ideographs.
   1133                 //         (PR | PO) x (AL | HL)
   1134                 //         (AL | HL) x (PR | PO)
   1135                 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
   1136                         (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1137                     continue;
   1138                 }
   1139                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
   1140                         (fPR.contains(thisChar) || fPO.contains(thisChar))) {
   1141                     continue;
   1142                 }
   1143 
   1144 
   1145                 // LB 25    Numbers
                        //          The match is attempted starting at prevPos; the matchVals array
                        //          is handed back in so it can be reused across loop iterations.
   1146                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
   1147                 if (matchVals[0] != -1) {
   1148                     // Matched a number.  But could have been just a single digit, which would
   1149                     //    not represent a "no break here" between prevChar and thisChar
   1150                     int numEndIdx = matchVals[1];  // idx of first char following num
   1151                     if (numEndIdx > pos) {
   1152                         // Number match includes at least the two chars being checked
   1153                         if (numEndIdx > nextPos) {
   1154                             // Number match includes additional chars.  Update pos and nextPos
   1155                             //   so that next loop iteration will continue at the end of the number,
   1156                             //   checking for breaks between last char in number & whatever follows.
   1157                             nextPos = numEndIdx;
   1158                             pos     = numEndIdx;
                                    // Back up from the end of the number to its last non-CM character.
   1159                             do {
   1160                                 pos = moveIndex32(fText, pos, -1);
   1161                                 thisChar = UTF16.charAt(fText, pos);
   1162                             }
   1163                             while (fCM.contains(thisChar));
   1164                         }
   1165                         continue;
   1166                     }
   1167                 }
   1168 
   1169 
   1170                 // LB 26  Do not break Korean Syllables
   1171                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
   1172                         fJV.contains(thisChar) ||
   1173                         fH2.contains(thisChar) ||
   1174                         fH3.contains(thisChar))) {
   1175                     continue;
   1176                 }
   1177 
   1178                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
   1179                         (fJV.contains(thisChar) || fJT.contains(thisChar))) {
   1180                     continue;
   1181                 }
   1182 
   1183                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
   1184                         fJT.contains(thisChar)) {
   1185                     continue;
   1186                 }
   1187 
   1188                 // LB 27 Treat a Korean Syllable Block the same as ID
   1189                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
   1190                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
   1191                         fIN.contains(thisChar)) {
   1192                     continue;
   1193                 }
   1194                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
   1195                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
   1196                         fPO.contains(thisChar)) {
   1197                     continue;
   1198                 }
   1199                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
   1200                         fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
   1201                     continue;
   1202                 }
   1203 
   1204 
   1205 
   1206                 // LB 28 Do not break between alphabetics
   1207                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1208                     continue;
   1209                 }
   1210 
   1211                 // LB 29  Do not break between numeric punctuation and alphabetics
   1212                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1213                     continue;
   1214                 }
   1215 
   1216                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   1217                 //          (AL | HL | NU) x OP
   1218                 //          CP x (AL | HL | NU)
   1219                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
   1220                     continue;
   1221                 }
   1222                 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
   1223                     continue;
   1224                 }
   1225 
   1226                 // LB 30a   Break between pairs of Regional Indicators.
   1227                 //             RI RI <break> RI
   1228                 //             RI    x    RI
   1229                 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
   1230                     break;
   1231                 }
   1232                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
   1233                     continue;
   1234                 }
   1235 
   1236                 // LB30b    Emoji Base x Emoji Modifier
   1237                 if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
   1238                     continue;
   1239                 }
   1240                 // LB 31    Break everywhere else
   1241                 break;
   1242             }
   1243 
   1244             return pos;
   1245         }
   1246 
   1247 
   1248 
   1249         // Match the following regular expression in the input text.
   1250         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
   1251         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
   1252         //  retVals array  [0]  index of the start of the match, or -1 if no match
   1253         //                 [1]  index of first char following the match.
   1254         //  Can not use Java regex because need supplementary character support,
   1255         //     and because Unicode char properties version must be the same as in
   1256         //     the version of ICU being tested.
   1257         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
   1258             if (retVals == null) {
   1259                 retVals = new int[2];
   1260             }
   1261             retVals[0]     = -1;  // Indicates no match.
   1262             int matchState = 0;
   1263             int idx        = startIdx;
   1264 
   1265             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
   1266                 int c = UTF16.charAt(s, idx);
   1267                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
   1268                 switch (matchState) {
   1269                 case 0:
   1270                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
   1271                     cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1272                         matchState = 1;
   1273                         break;
   1274                     }
   1275                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
   1276                         matchState = 4;
   1277                         break;
   1278                     }
   1279                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
   1280                         matchState = 4;
   1281                         break;
   1282                     }
   1283                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1284                         matchState = 7;
   1285                         break;
   1286                     }
   1287                     break matchLoop;   /* No Match  */
   1288 
   1289                 case 1:
   1290                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1291                         matchState = 1;
   1292                         break;
   1293                     }
   1294                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
   1295                         matchState = 4;
   1296                         break;
   1297                     }
   1298                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
   1299                         matchState = 4;
   1300                         break;
   1301                     }
   1302                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1303                         matchState = 7;
   1304                         break;
   1305                     }
   1306                     break matchLoop;   /* No Match  */
   1307 
   1308 
   1309                 case 4:
   1310                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1311                         matchState = 4;
   1312                         break;
   1313                     }
   1314                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1315                         matchState = 7;
   1316                         break;
   1317                     }
   1318                     break matchLoop;   /* No Match  */
   1319                     //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
   1320                     //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
   1321 
   1322                 case 7:
   1323                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1324                         matchState = 7;
   1325                         break;
   1326                     }
   1327                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1328                         matchState = 7;
   1329                         break;
   1330                     }
   1331                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
   1332                         matchState = 7;
   1333                         break;
   1334                     }
   1335                     if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
   1336                         matchState = 7;
   1337                         break;
   1338                     }
   1339                     if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
   1340                         matchState = 9;
   1341                         break;
   1342                     }
   1343                     if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
   1344                         matchState = 9;
   1345                         break;
   1346                     }
   1347                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1348                         matchState = 11;
   1349                         break;
   1350                     }
   1351                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
   1352                         matchState = 11;
   1353                         break;
   1354                     }
   1355 
   1356                     break matchLoop;    // Match Complete.
   1357                 case 9:
   1358                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1359                         matchState = 9;
   1360                         break;
   1361                     }
   1362                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1363                         matchState = 11;
   1364                         break;
   1365                     }
   1366                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
   1367                         matchState = 11;
   1368                         break;
   1369                     }
   1370                     break matchLoop;    // Match Complete.
   1371                 case 11:
   1372                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1373                         matchState = 11;
   1374                         break;
   1375                     }
   1376                     break matchLoop;    // Match Complete.
   1377                 }
   1378             }
   1379             if (matchState > 4) {
   1380                 retVals[0] = startIdx;
   1381                 retVals[1] = idx;
   1382             }
   1383             return retVals;
   1384         }
   1385 
   1386 
   1387         @Override
   1388         List  charClasses() {
   1389             return fSets;
   1390         }
   1391 
   1392 
   1393 
   1394     }
   1395 
   1396 
   1397     /**
   1398      *
   1399      * Sentence Monkey Test Class
   1400      *
   1401      *
   1402      *
   1403      */
   1404     static class RBBISentenceMonkey extends RBBIMonkeyKind {
   1405         List                 fSets;
   1406         StringBuffer         fText;
   1407 
   1408         UnicodeSet           fSepSet;
   1409         UnicodeSet           fFormatSet;
   1410         UnicodeSet           fSpSet;
   1411         UnicodeSet           fLowerSet;
   1412         UnicodeSet           fUpperSet;
   1413         UnicodeSet           fOLetterSet;
   1414         UnicodeSet           fNumericSet;
   1415         UnicodeSet           fATermSet;
   1416         UnicodeSet           fSContinueSet;
   1417         UnicodeSet           fSTermSet;
   1418         UnicodeSet           fCloseSet;
   1419         UnicodeSet           fOtherSet;
   1420         UnicodeSet           fExtendSet;
   1421 
   1422 
   1423 
   1424         RBBISentenceMonkey() {
   1425             fCharProperty  = UProperty.SENTENCE_BREAK;
   1426 
   1427             fSets            = new ArrayList();
   1428 
   1429             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   1430             //                       set and made into character classes of their own.  For the monkey impl,
   1431             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   1432             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
   1433             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
   1434             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
   1435             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
   1436             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
   1437             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
   1438             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
   1439             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
   1440             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
   1441             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
   1442             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
   1443             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
   1444             fOtherSet        = new UnicodeSet();
   1445 
   1446 
   1447             fOtherSet.complement();
   1448             fOtherSet.removeAll(fSepSet);
   1449             fOtherSet.removeAll(fFormatSet);
   1450             fOtherSet.removeAll(fSpSet);
   1451             fOtherSet.removeAll(fLowerSet);
   1452             fOtherSet.removeAll(fUpperSet);
   1453             fOtherSet.removeAll(fOLetterSet);
   1454             fOtherSet.removeAll(fNumericSet);
   1455             fOtherSet.removeAll(fATermSet);
   1456             fOtherSet.removeAll(fSContinueSet);
   1457             fOtherSet.removeAll(fSTermSet);
   1458             fOtherSet.removeAll(fCloseSet);
   1459             fOtherSet.removeAll(fExtendSet);
   1460 
   1461             fSets.add(fSepSet);
   1462             fSets.add(fFormatSet);
   1463 
   1464             fSets.add(fSpSet);
   1465             fSets.add(fLowerSet);
   1466             fSets.add(fUpperSet);
   1467             fSets.add(fOLetterSet);
   1468             fSets.add(fNumericSet);
   1469             fSets.add(fATermSet);
   1470             fSets.add(fSContinueSet);
   1471             fSets.add(fSTermSet);
   1472             fSets.add(fCloseSet);
   1473             fSets.add(fOtherSet);
   1474             fSets.add(fExtendSet);
   1475         }
   1476 
   1477 
   1478         @Override
   1479         List  charClasses() {
   1480             return fSets;
   1481         }
   1482 
   1483         @Override
   1484         void   setText(StringBuffer s) {
   1485             fText = s;
   1486         }
   1487 
   1488 
   1489         //      moveBack()   Find the "significant" code point preceding the index i.
   1490         //      Skips over ($Extend | $Format)*
   1491         //
   1492         private int moveBack(int i) {
   1493 
   1494             if (i <= 0) {
   1495                 return -1;
   1496             }
   1497 
   1498             int      c;
   1499             int      j = i;
   1500             do {
   1501                 j = moveIndex32(fText, j, -1);
   1502                 c = UTF16.charAt(fText, j);
   1503             }
   1504             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
   1505             return j;
   1506         }
   1507 
   1508 
   1509         int moveForward(int i) {
   1510             if (i>=fText.length()) {
   1511                 return fText.length();
   1512             }
   1513             int   c;
   1514             int   j = i;
   1515             do {
   1516                 j = moveIndex32(fText, j, 1);
   1517                 c = cAt(j);
   1518             }
   1519             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
   1520             return j;
   1521 
   1522         }
   1523 
   1524         int cAt(int pos) {
   1525             if (pos<0 || pos>=fText.length()) {
   1526                 return -1;
   1527             }
   1528             return UTF16.charAt(fText, pos);
   1529         }
   1530 
   1531         @Override
   1532         int   next(int prevPos) {
   1533             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
   1534             //   break position being tested.  The candidate break
   1535             //   location is before p2.
   1536             int     breakPos = -1;
   1537 
   1538             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
   1539             int c;
   1540 
   1541             // Prev break at end of string.  return DONE.
   1542             if (prevPos >= fText.length()) {
   1543                 return -1;
   1544             }
   1545             /*p0 =*/ p1 = p2 = p3 = prevPos;
   1546             c3 = UTF16.charAt(fText, prevPos);
   1547             c0 = c1 = c2 = 0;
   1548 
   1549             // Loop runs once per "significant" character position in the input text.
   1550             for (;;) {
   1551                 // Move all of the positions forward in the input string.
   1552                 /*p0 = p1;*/  c0 = c1;
   1553                 p1 = p2;  c1 = c2;
   1554                 p2 = p3;  c2 = c3;
   1555 
   1556                 // Advancd p3 by  X(Extend | Format)*   Rule 4
   1557                 p3 = moveForward(p3);
   1558                 c3 = cAt(p3);
   1559 
   1560                 // Rule (3) CR x LF
   1561                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   1562                     continue;
   1563                 }
   1564 
   1565                 // Rule (4)    Sep  <break>
   1566                 if (fSepSet.contains(c1)) {
   1567                     p2 = p1+1;   // Separators don't combine with Extend or Format
   1568                     break;
   1569                 }
   1570 
   1571                 if (p2 >= fText.length()) {
   1572                     // Reached end of string.  Always a break position.
   1573                     break;
   1574                 }
   1575 
   1576                 if (p2 == prevPos) {
   1577                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   1578                     continue;
   1579                 }
   1580 
   1581                 // Rule (6).   ATerm x Numeric
   1582                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
   1583                     continue;
   1584                 }
   1585 
   1586                 // Rule (7).  (Upper | Lower) ATerm  x  Uppper
   1587                 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
   1588                         fATermSet.contains(c1) && fUpperSet.contains(c2)) {
   1589                     continue;
   1590                 }
   1591 
   1592                 // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
   1593                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a
   1594                 //                  note to the Unicode 5.0 documents.
   1595                 int p8 = p1;
   1596                 while (p8>0 && fSpSet.contains(cAt(p8))) {
   1597                     p8 = moveBack(p8);
   1598                 }
   1599                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
   1600                     p8 = moveBack(p8);
   1601                 }
   1602                 if (fATermSet.contains(cAt(p8))) {
   1603                     p8=p2;
   1604                     for (;;) {
   1605                         c = cAt(p8);
   1606                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
   1607                                 fLowerSet.contains(c) || fSepSet.contains(c) ||
   1608                                 fATermSet.contains(c) || fSTermSet.contains(c))
   1609                         {
   1610                             break;
   1611                         }
   1612                         p8 = moveForward(p8);
   1613                     }
   1614                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
   1615                         continue;
   1616                     }
   1617                 }
   1618 
   1619                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
   1620                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
   1621                     p8 = p1;
   1622                     while (setContains(fSpSet, cAt(p8))) {
   1623                         p8 = moveBack(p8);
   1624                     }
   1625                     while (setContains(fCloseSet, cAt(p8))) {
   1626                         p8 = moveBack(p8);
   1627                     }
   1628                     c = cAt(p8);
   1629                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
   1630                         continue;
   1631                     }
   1632                 }
   1633 
   1634 
   1635                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   1636                 int p9 = p1;
   1637                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
   1638                     p9 = moveBack(p9);
   1639                 }
   1640                 c = cAt(p9);
   1641                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
   1642                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
   1643                         continue;
   1644                     }
   1645                 }
   1646 
   1647                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   1648                 int p10 = p1;
   1649                 while (p10>0 && fSpSet.contains(cAt(p10))) {
   1650                     p10 = moveBack(p10);
   1651                 }
   1652                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
   1653                     p10 = moveBack(p10);
   1654                 }
   1655                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
   1656                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
   1657                         continue;
   1658                     }
   1659                 }
   1660 
   1661                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
   1662                 int p11 = p1;
   1663                 if (p11>0 && fSepSet.contains(cAt(p11))) {
   1664                     p11 = moveBack(p11);
   1665                 }
   1666                 while (p11>0 && fSpSet.contains(cAt(p11))) {
   1667                     p11 = moveBack(p11);
   1668                 }
   1669                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
   1670                     p11 = moveBack(p11);
   1671                 }
   1672                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
   1673                     break;
   1674                 }
   1675 
   1676                 //  Rule (12)  Any x Any
   1677                 continue;
   1678             }
   1679             breakPos = p2;
   1680             return breakPos;
   1681         }
   1682 
   1683 
   1684 
   1685     }
   1686 
   1687 
   1688     /**
   1689      * Move an index into a string by n code points.
   1690      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
   1691      *   complicating usage.
   1692      * @param s   a Text string
   1693      * @param pos The starting code unit index into the text string
   1694      * @param amt The amount to adjust the string by.
   1695      * @return    The adjusted code unit index, pinned to the string's length, or
   1696      *            unchanged if input index was outside of the string.
   1697      */
   1698     static int moveIndex32(StringBuffer s, int pos, int amt) {
   1699         int i;
   1700         char  c;
   1701         if (amt>0) {
   1702             for (i=0; i<amt; i++) {
   1703                 if (pos >= s.length()) {
   1704                     return s.length();
   1705                 }
   1706                 c = s.charAt(pos);
   1707                 pos++;
   1708                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
   1709                     c = s.charAt(pos);
   1710                     if (UTF16.isTrailSurrogate(c)) {
   1711                         pos++;
   1712                     }
   1713                 }
   1714             }
   1715         } else {
   1716             for (i=0; i>amt; i--) {
   1717                 if (pos <= 0) {
   1718                     return 0;
   1719                 }
   1720                 pos--;
   1721                 c = s.charAt(pos);
   1722                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
   1723                     c = s.charAt(pos);
   1724                     if (UTF16.isLeadSurrogate(c)) {
   1725                         pos--;
   1726                     }
   1727                 }
   1728             }
   1729         }
   1730         return pos;
   1731     }
   1732 
   1733     /**
   1734      * No-exceptions form of UnicodeSet.contains(c).
   1735      *    Simplifies loops that terminate with an end-of-input character value.
   1736      * @param s  A unicode set
   1737      * @param c  A code point value
   1738      * @return   true if the set contains c.
   1739      */
   1740     static boolean setContains(UnicodeSet s, int c) {
   1741         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
   1742             return false;
   1743         }
   1744         return s.contains(c);
   1745     }
   1746 
   1747 
   1748     /**
   1749      * return the index of the next code point in the input text.
   1750      * @param i the preceding index
   1751      */
   1752     static int  nextCP(StringBuffer s, int i) {
   1753         if (i == -1) {
   1754             // End of Input indication.  Continue to return end value.
   1755             return -1;
   1756         }
   1757         int  retVal = i + 1;
   1758         if (retVal > s.length()) {
   1759             return -1;
   1760         }
   1761         int  c = UTF16.charAt(s, i);
   1762         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
   1763             retVal++;
   1764         }
   1765         return retVal;
   1766     }
   1767 
   1768 
   1769     /**
   1770      * random number generator.  Not using Java's built-in Randoms for two reasons:
   1771      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
   1772      *    2.  We need to get and restore the seed from values occurring in the middle
   1773      *        of a long sequence, to more easily reproduce failing cases.
   1774      */
   1775     private static int m_seed = 1;
   1776     private static int  m_rand()
   1777     {
   1778         m_seed = m_seed * 1103515245 + 12345;
   1779         return (m_seed >>> 16) % 32768;
   1780     }
   1781 
   1782     // Helper function for formatting error output.
   1783     //   Append a string into a fixed-size field in a StringBuffer.
   1784     //   Blank-pad the string if it is shorter than the field.
   1785     //   Truncate the source string if it is too long.
   1786     //
   1787     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
   1788         int appendLen = src.length();
   1789         if (appendLen >= fieldLen) {
   1790             dest.append(src.substring(0, fieldLen));
   1791         } else {
   1792             dest.append(src);
   1793             while (appendLen < fieldLen) {
   1794                 dest.append(' ');
   1795                 appendLen++;
   1796             }
   1797         }
   1798     }
   1799 
   1800     // Helper function for formatting error output.
   1801     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
   1802     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
   1803         String hexChars = "0123456789abcdef";
   1804         if (c < 0x10000) {
   1805             dest.append("\\u");
   1806             for (int bn=12; bn>=0; bn-=4) {
   1807                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
   1808             }
   1809             appendToBuf(dest, " ", fieldLen-6);
   1810         } else {
   1811             dest.append("\\U");
   1812             for (int bn=28; bn>=0; bn-=4) {
   1813                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
   1814             }
   1815             appendToBuf(dest, " ", fieldLen-10);
   1816 
   1817         }
   1818     }
   1819 
   1820     /**
   1821      *  Run a RBBI monkey test.  Common routine, for all break iterator types.
   1822      *    Parameters:
   1823      *       bi      - the break iterator to use
   1824      *       mk      - MonkeyKind, abstraction for obtaining expected results
   1825      *       name    - Name of test (char, word, etc.) for use in error messages
   1826      *       seed    - Seed for starting random number generator (parameter from user)
   1827      *       numIterations
   1828      */
   1829     void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
   1830         int              TESTSTRINGLEN = 500;
   1831         StringBuffer     testText         = new StringBuffer();
   1832         int              numCharClasses;
   1833         List             chClasses;
   1834         int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
   1835         int              expectedCount    = 0;
   1836         boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
   1837         boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
   1838         boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
   1839         boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
   1840         boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
   1841         boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
   1842         int              i;
   1843         int              loopCount        = 0;
   1844         boolean          printTestData    = false;
   1845         boolean          printBreaksFromBI = false;
   1846 
   1847         m_seed = seed;
   1848 
   1849         numCharClasses = mk.charClasses().size();
   1850         chClasses      = mk.charClasses();
   1851 
   1852         // Verify that the character classes all have at least one member.
   1853         for (i=0; i<numCharClasses; i++) {
   1854             UnicodeSet s = (UnicodeSet)chClasses.get(i);
   1855             if (s == null || s.size() == 0) {
   1856                 errln("Character Class " + i + " is null or of zero size.");
   1857                 return;
   1858             }
   1859         }
   1860 
   1861         //--------------------------------------------------------------------------------------------
   1862         //
   1863         //  Debugging settings.  Comment out everything in the following block for normal operation
   1864         //
   1865         //--------------------------------------------------------------------------------------------
   1866         // numIterations = -1;
   1867         // numIterations = 10000;   // Same as exhaustive.
   1868         // RuleBasedBreakIterator_New.fTrace = true;
   1869         // m_seed = 859056465;
   1870         // TESTSTRINGLEN = 50;
   1871         // printTestData = true;
   1872         // printBreaksFromBI = true;
   1873         // ((RuleBasedBreakIterator_New)bi).dump();
   1874 
   1875         //--------------------------------------------------------------------------------------------
   1876         //
   1877         //  End of Debugging settings.
   1878         //
   1879         //--------------------------------------------------------------------------------------------
   1880 
   1881         int  dotsOnLine = 0;
   1882         while (loopCount < numIterations || numIterations == -1) {
   1883             if (numIterations == -1 && loopCount % 10 == 0) {
   1884                 // If test is running in an infinite loop, display a periodic tic so
   1885                 //   we can tell that it is making progress.
   1886                 System.out.print(".");
   1887                 if (dotsOnLine++ >= 80){
   1888                     System.out.println();
   1889                     dotsOnLine = 0;
   1890                 }
   1891             }
   1892             // Save current random number seed, so that we can recreate the random numbers
   1893             //   for this loop iteration in event of an error.
   1894             seed = m_seed;
   1895 
   1896             testText.setLength(0);
   1897             // Populate a test string with data.
   1898             if (printTestData) {
   1899                 System.out.println("Test Data string ...");
   1900             }
   1901             for (i=0; i<TESTSTRINGLEN; i++) {
   1902                 int        aClassNum = m_rand() % numCharClasses;
   1903                 UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
   1904                 int        charIdx   = m_rand() % classSet.size();
   1905                 int        c         = classSet.charAt(charIdx);
   1906                 if (c < 0) {   // TODO:  deal with sets containing strings.
   1907                     errln("c < 0");
   1908                 }
   1909                 UTF16.appendCodePoint(testText, c);
   1910                 if (printTestData) {
   1911                     System.out.print(Integer.toHexString(c) + " ");
   1912                 }
   1913             }
   1914             if (printTestData) {
   1915                 System.out.println();
   1916             }
   1917 
   1918             Arrays.fill(expected, 0);
   1919             Arrays.fill(expectedBreaks, false);
   1920             Arrays.fill(forwardBreaks, false);
   1921             Arrays.fill(reverseBreaks, false);
   1922             Arrays.fill(isBoundaryBreaks, false);
   1923             Arrays.fill(followingBreaks, false);
   1924             Arrays.fill(precedingBreaks, false);
   1925 
   1926             // Calculate the expected results for this test string.
   1927             mk.setText(testText);
   1928             expectedCount = 0;
   1929             expectedBreaks[0] = true;
   1930             expected[expectedCount ++] = 0;
   1931             int breakPos = 0;
   1932             int lastBreakPos = -1;
   1933             for (;;) {
   1934                 lastBreakPos = breakPos;
   1935                 breakPos = mk.next(breakPos);
   1936                 if (breakPos == -1) {
   1937                     break;
   1938                 }
   1939                 if (breakPos > testText.length()) {
   1940                     errln("breakPos > testText.length()");
   1941                 }
   1942                 if (lastBreakPos >= breakPos) {
   1943                     errln("Next() not increasing.");
   1944                     // break;
   1945                 }
   1946                 expectedBreaks[breakPos] = true;
   1947                 expected[expectedCount ++] = breakPos;
   1948             }
   1949 
   1950             // Find the break positions using forward iteration
   1951             if (printBreaksFromBI) {
   1952                 System.out.println("Breaks from BI...");
   1953             }
   1954             bi.setText(testText.toString());
   1955             for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
   1956                 if (i < 0 || i > testText.length()) {
   1957                     errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
   1958                     break;
   1959                 }
   1960                 if (printBreaksFromBI) {
   1961                     System.out.print(Integer.toHexString(i) + " ");
   1962                 }
   1963                 forwardBreaks[i] = true;
   1964             }
   1965             if (printBreaksFromBI) {
   1966                 System.out.println();
   1967             }
   1968 
   1969             // Find the break positions using reverse iteration
   1970             for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
   1971                 if (i < 0 || i > testText.length()) {
   1972                     errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
   1973                     break;
   1974                 }
   1975                 reverseBreaks[i] = true;
   1976             }
   1977 
   1978             // Find the break positions using isBoundary() tests.
   1979             for (i=0; i<=testText.length(); i++) {
   1980                 isBoundaryBreaks[i] = bi.isBoundary(i);
   1981             }
   1982 
   1983             // Find the break positions using the following() function.
   1984             lastBreakPos = 0;
   1985             followingBreaks[0] = true;
   1986             for (i=0; i<testText.length(); i++) {
   1987                 breakPos = bi.following(i);
   1988                 if (breakPos <= i ||
   1989                         breakPos < lastBreakPos ||
   1990                         breakPos > testText.length() ||
   1991                         breakPos > lastBreakPos && lastBreakPos > i ) {
   1992                     errln(name + " break monkey test: " +
   1993                             "Out of range value returned by BreakIterator::following().\n" +
   1994                             "index=" + i + "following returned=" + breakPos +
   1995                             "lastBreak=" + lastBreakPos);
   1996                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
   1997                 } else {
   1998                     followingBreaks[breakPos] = true;
   1999                     lastBreakPos = breakPos;
   2000                 }
   2001             }
   2002 
   2003             // Find the break positions using the preceding() function.
   2004             lastBreakPos = testText.length();
   2005             precedingBreaks[testText.length()] = true;
   2006             for (i=testText.length(); i>0; i--) {
   2007                 breakPos = bi.preceding(i);
   2008                 if (breakPos >= i ||
   2009                         breakPos > lastBreakPos ||
   2010                         breakPos < 0 ||
   2011                         breakPos < lastBreakPos && lastBreakPos < i ) {
   2012                     errln(name + " break monkey test: " +
   2013                             "Out of range value returned by BreakIterator::preceding().\n" +
   2014                             "index=" + i + "preceding returned=" + breakPos +
   2015                             "lastBreak=" + lastBreakPos);
   2016                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
   2017                 } else {
   2018                     precedingBreaks[breakPos] = true;
   2019                     lastBreakPos = breakPos;
   2020                 }
   2021             }
   2022 
   2023 
   2024 
   2025             // Compare the expected and actual results.
   2026             for (i=0; i<=testText.length(); i++) {
   2027                 String errorType = null;
   2028                 if  (forwardBreaks[i] != expectedBreaks[i]) {
   2029                     errorType = "next()";
   2030                 } else if (reverseBreaks[i] != forwardBreaks[i]) {
   2031                     errorType = "previous()";
   2032                 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   2033                     errorType = "isBoundary()";
   2034                 } else if (followingBreaks[i] != expectedBreaks[i]) {
   2035                     errorType = "following()";
   2036                 } else if (precedingBreaks[i] != expectedBreaks[i]) {
   2037                     errorType = "preceding()";
   2038                 }
   2039 
   2040                 if (errorType != null) {
   2041                     // Format a range of the test text that includes the failure as
   2042                     //  a data item that can be included in the rbbi test data file.
   2043 
   2044                     // Start of the range is the last point where expected and actual results
   2045                     //   both agreed that there was a break position.
   2046                     int startContext = i;
   2047                     int count = 0;
   2048                     for (;;) {
   2049                         if (startContext==0) { break; }
   2050                         startContext --;
   2051                         if (expectedBreaks[startContext]) {
   2052                             if (count == 2) break;
   2053                             count ++;
   2054                         }
   2055                     }
   2056 
   2057                     // End of range is two expected breaks past the start position.
   2058                     int endContext = i + 1;
   2059                     int ci;
   2060                     for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   2061                         for (;;) {
   2062                             if (endContext >= testText.length()) {break;}
   2063                             if (expectedBreaks[endContext-1]) {
   2064                                 if (count == 0) break;
   2065                                 count --;
   2066                             }
   2067                             endContext ++;
   2068                         }
   2069                     }
   2070 
   2071                     // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
   2072                     StringBuffer errorText = new StringBuffer();
   2073 
   2074                     int      c;    // Char from test data
   2075                     for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
   2076                         if (ci == i) {
   2077                             // This is the location of the error.
   2078                             errorText.append("<?>---------------------------------\n");
   2079                         } else if (expectedBreaks[ci]) {
   2080                             // This a non-error expected break position.
   2081                             errorText.append("------------------------------------\n");
   2082                         }
   2083                         if (ci < testText.length()) {
   2084                             c = UTF16.charAt(testText, ci);
   2085                             appendCharToBuf(errorText, c, 11);
   2086                             String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
   2087                             appendToBuf(errorText, gc, 8);
   2088                             int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
   2089                             String extraPropValue =
   2090                                     UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
   2091                             appendToBuf(errorText, extraPropValue, 20);
   2092 
   2093                             String charName = UCharacter.getExtendedName(c);
   2094                             appendToBuf(errorText, charName, 40);
   2095                             errorText.append('\n');
   2096                         }
   2097                     }
   2098                     if (ci == testText.length() && ci != -1) {
   2099                         errorText.append("<>");
   2100                     }
   2101                     errorText.append("</data>\n");
   2102 
   2103                     // Output the error
   2104                     errln(name + " break monkey test error.  " +
   2105                             (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
   2106                             "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
   2107                             errorText);
   2108                     break;
   2109                 }
   2110             }
   2111 
   2112             loopCount++;
   2113         }
   2114     }
   2115 
   2116     @Test
   2117     public void TestCharMonkey() {
   2118 
   2119         int        loopCount = 500;
   2120         int        seed      = 1;
   2121 
   2122         if (TestFmwk.getExhaustiveness() >= 9) {
   2123             loopCount = 10000;
   2124         }
   2125 
   2126         RBBICharMonkey  m = new RBBICharMonkey();
   2127         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
   2128         RunMonkey(bi, m, "char", seed, loopCount);
   2129     }
   2130 
   2131     @Test
   2132     public void TestWordMonkey() {
   2133 
   2134         int        loopCount = 500;
   2135         int        seed      = 1;
   2136 
   2137         if (TestFmwk.getExhaustiveness() >= 9) {
   2138             loopCount = 10000;
   2139         }
   2140 
   2141         logln("Word Break Monkey Test");
   2142         RBBIWordMonkey  m = new RBBIWordMonkey();
   2143         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
   2144         RunMonkey(bi, m, "word", seed, loopCount);
   2145     }
   2146 
   2147     @Test
   2148     public void TestLineMonkey() {
   2149         int        loopCount = 500;
   2150         int        seed      = 1;
   2151 
   2152         if (TestFmwk.getExhaustiveness() >= 9) {
   2153             loopCount = 10000;
   2154         }
   2155 
   2156         logln("Line Break Monkey Test");
   2157         RBBILineMonkey  m = new RBBILineMonkey();
   2158         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
   2159         RunMonkey(bi, m, "line", seed, loopCount);
   2160     }
   2161 
   2162     @Test
   2163     public void TestSentMonkey() {
   2164 
   2165         int        loopCount = 500;
   2166         int        seed      = 1;
   2167 
   2168         if (TestFmwk.getExhaustiveness() >= 9) {
   2169             loopCount = 3000;
   2170         }
   2171 
   2172         logln("Sentence Break Monkey Test");
   2173         RBBISentenceMonkey  m = new RBBISentenceMonkey();
   2174         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
   2175         RunMonkey(bi, m, "sent", seed, loopCount);
   2176     }
   2177     //
   2178     //  Round-trip monkey tests.
   2179     //  Verify that break iterators created from the rule source from the default
   2180     //    break iterators still pass the monkey test for the iterator type.
   2181     //
   2182     //  This is a major test for the Rule Compiler.  The default break iterators are built
   2183     //  from pre-compiled binary rule data that was created using ICU4C; these
   2184     //  round-trip rule recompile tests verify that the Java rule compiler can
   2185     //  rebuild break iterators from the original source rules.
   2186     //
   2187     @Test
   2188     public void TestRTCharMonkey() {
   2189 
   2190         int        loopCount = 200;
   2191         int        seed      = 1;
   2192 
   2193         if (TestFmwk.getExhaustiveness() >= 9) {
   2194             loopCount = 2000;
   2195         }
   2196 
   2197         RBBICharMonkey  m = new RBBICharMonkey();
   2198         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
   2199         String rules = bi.toString();
   2200         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2201         RunMonkey(rtbi, m, "char", seed, loopCount);
   2202     }
   2203 
   2204     @Test
   2205     public void TestRTWordMonkey() {
   2206 
   2207         int        loopCount = 200;
   2208         int        seed      = 1;
   2209 
   2210         if (TestFmwk.getExhaustiveness() >= 9) {
   2211             loopCount = 2000;
   2212         }
   2213         logln("Word Break Monkey Test");
   2214         RBBIWordMonkey  m = new RBBIWordMonkey();
   2215         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
   2216         String rules = bi.toString();
   2217         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2218         RunMonkey(rtbi, m, "word", seed, loopCount);
   2219     }
   2220 
   2221     @Test
   2222     public void TestRTLineMonkey() {
   2223         int        loopCount = 200;
   2224         int        seed      = 1;
   2225 
   2226         if (TestFmwk.getExhaustiveness() >= 9) {
   2227             loopCount = 2000;
   2228         }
   2229 
   2230         logln("Line Break Monkey Test");
   2231         RBBILineMonkey  m = new RBBILineMonkey();
   2232         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
   2233         String rules = bi.toString();
   2234         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2235         RunMonkey(rtbi, m, "line", seed, loopCount);
   2236     }
   2237 
   2238     @Test
   2239     public void TestRTSentMonkey() {
   2240 
   2241         int        loopCount = 200;
   2242         int        seed      = 1;
   2243 
   2244         if (TestFmwk.getExhaustiveness() >= 9) {
   2245             loopCount = 1000;
   2246         }
   2247 
   2248         logln("Sentence Break Monkey Test");
   2249         RBBISentenceMonkey  m = new RBBISentenceMonkey();
   2250         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
   2251         String rules = bi.toString();
   2252         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2253         RunMonkey(rtbi, m, "sent", seed, loopCount);
   2254     }
   2255 }
   2256 
   2257