Home | History | Annotate | Download | only in rbbi
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2003-2016 International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.test.rbbi;
     10 
     11 
     12 // Monkey testing of RuleBasedBreakIterator.
     13 //    The old, original monkey test. TODO: remove
     14 //    The new monkey test is class RBBIMonkeyTest.
     15 
     16 import java.util.ArrayList;
     17 import java.util.Arrays;
     18 import java.util.List;
     19 import java.util.Locale;
     20 
     21 import org.junit.Test;
     22 import org.junit.runner.RunWith;
     23 import org.junit.runners.JUnit4;
     24 
     25 import com.ibm.icu.dev.test.TestFmwk;
     26 import com.ibm.icu.lang.UCharacter;
     27 import com.ibm.icu.lang.UProperty;
     28 import com.ibm.icu.text.BreakIterator;
     29 import com.ibm.icu.text.RuleBasedBreakIterator;
     30 import com.ibm.icu.text.UTF16;
     31 import com.ibm.icu.text.UnicodeSet;
     32 
     33 
     34 /**
     35  * Monkey tests for RBBI.  These tests have independent implementations of
     36  * the Unicode TR boundary rules, and compare results between these and ICU's
     37  * implementation, using random data.
     38  *
     39  * Tests cover Grapheme Cluster (char), Word and Line breaks
     40  *
     41  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
     42  *
     43  */
     44 @RunWith(JUnit4.class)
     45 public class RBBITestMonkey extends TestFmwk {
     46     //
     47     //     class RBBIMonkeyKind
     48     //
     49     //        Monkey Test for Break Iteration
     50     //        Abstract interface class.   Concrete derived classes independently
     51     //        implement the break rules for different iterator types.
     52     //
     53     //        The Monkey Test itself uses doesn't know which type of break iterator it is
     54     //        testing, but works purely in terms of the interface defined here.
     55     //
     56     abstract static class RBBIMonkeyKind {
     57 
     58         // Return a List of UnicodeSets, representing the character classes used
     59         //   for this type of iterator.
     60         abstract  List  charClasses();
     61 
     62         // Set the test text on which subsequent calls to next() will operate
     63         abstract  void   setText(StringBuffer text);
     64 
     65         // Find the next break position, starting from the specified position.
     66         // Return -1 after reaching end of string.
     67         abstract   int   next(int i);
     68 
     69         // A Character Property, one of the constants defined in class UProperty.
     70         //   The value of this property will be displayed for the characters
     71         //    near any test failure.
     72         int   fCharProperty;
     73     }
     74 
     75     //
     76     // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, 13267
     77     //
     78     static String gExtended_Pict = "[" +
     79             "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767" +
     80             "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
     81             "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F" +
     82             "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F" +
     83             "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6" +
     84             "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586" +
     85             "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7" +
     86             "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB" +
     87             "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
     88             "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C" +
     89             "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637" +
     90             "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A" +
     91             "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9" +
     92             "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD" +
     93             "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF" +
     94             "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5" +
     95             "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F" +
     96             "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F" +
     97             "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F" +
     98             "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF" +
     99             "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8" +
    100             "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF" +
    101             "]";
    102 
    103 
    104     /**
    105      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
    106      * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
    107      */
    108     static class RBBICharMonkey extends RBBIMonkeyKind {
    109         List                      fSets;
    110 
    111         UnicodeSet                fCRLFSet;
    112         UnicodeSet                fControlSet;
    113         UnicodeSet                fExtendSet;
    114         UnicodeSet                fRegionalIndicatorSet;
    115         UnicodeSet                fPrependSet;
    116         UnicodeSet                fSpacingSet;
    117         UnicodeSet                fLSet;
    118         UnicodeSet                fVSet;
    119         UnicodeSet                fTSet;
    120         UnicodeSet                fLVSet;
    121         UnicodeSet                fLVTSet;
    122         UnicodeSet                fHangulSet;
    123         UnicodeSet                fEmojiModifierSet;
    124         UnicodeSet                fEmojiBaseSet;
    125         UnicodeSet                fZWJSet;
    126         UnicodeSet                fExtendedPictSet;
    127         UnicodeSet                fEBGSet;
    128         UnicodeSet                fEmojiNRKSet;
    129         UnicodeSet                fAnySet;
    130 
    131 
    132         StringBuffer              fText;
    133 
    134 
    135         RBBICharMonkey() {
    136             fText       = null;
    137             fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
    138             fCRLFSet    = new UnicodeSet("[\\r\\n]");
    139             fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
    140             fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
    141             fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
    142             fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
    143             fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
    144             fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
    145             fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
    146             fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
    147             fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
    148             fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
    149             fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
    150             fHangulSet  = new UnicodeSet();
    151             fHangulSet.addAll(fLSet);
    152             fHangulSet.addAll(fVSet);
    153             fHangulSet.addAll(fTSet);
    154             fHangulSet.addAll(fLVSet);
    155             fHangulSet.addAll(fLVTSet);
    156 
    157             fEmojiBaseSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}]");
    158             fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]");
    159             fExtendedPictSet  = new UnicodeSet(gExtended_Pict);
    160             fEBGSet           = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]");
    161             fEmojiNRKSet      = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]");
    162             fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
    163 
    164 
    165             fSets       = new ArrayList();
    166             fSets.add(fCRLFSet);
    167             fSets.add(fControlSet);
    168             fSets.add(fExtendSet);
    169             fSets.add(fRegionalIndicatorSet);
    170             if (!fPrependSet.isEmpty()) {
    171                 fSets.add(fPrependSet);
    172             }
    173             fSets.add(fSpacingSet);
    174             fSets.add(fHangulSet);
    175             fSets.add(fAnySet);
    176             fSets.add(fEmojiBaseSet);
    177             fSets.add(fEmojiModifierSet);
    178             fSets.add(fZWJSet);
    179             fSets.add(fExtendedPictSet);
    180             fSets.add(fEBGSet);
    181             fSets.add(fEmojiNRKSet);
    182         }
    183 
    184 
    185         @Override
    186         void setText(StringBuffer s) {
    187             fText = s;
    188         }
    189 
    190         @Override
    191         List charClasses() {
    192             return fSets;
    193         }
    194 
    195         @Override
    196         int next(int prevPos) {
    197             int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
    198             //   break position being tested.  The candidate break
    199             //   location is before p2.
    200 
    201             int     breakPos = -1;
    202 
    203             int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
    204             int   cBase;              // for (X Extend*) patterns, the X character.
    205 
    206             // Previous break at end of string.  return DONE.
    207             if (prevPos >= fText.length()) {
    208                 return -1;
    209             }
    210             /* p0 = */ p1 = p2 = p3 = prevPos;
    211             c3 =  UTF16.charAt(fText, prevPos);
    212             c0 = c1 = c2 = cBase = 0;
    213 
    214             // Loop runs once per "significant" character position in the input text.
    215             for (;;) {
    216                 // Move all of the positions forward in the input string.
    217                 /* p0 = p1;*/  c0 = c1;
    218                 p1 = p2;  c1 = c2;
    219                 p2 = p3;  c2 = c3;
    220 
    221                 // Advance p3 by one codepoint
    222                 p3 = moveIndex32(fText, p3, 1);
    223                 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
    224 
    225                 if (p1 == p2) {
    226                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
    227                     continue;
    228                 }
    229                 if (p2 == fText.length()) {
    230                     // Reached end of string.  Always a break position.
    231                     break;
    232                 }
    233 
    234                 // Rule  GB3   CR x LF
    235                 //     No Extend or Format characters may appear between the CR and LF,
    236                 //     which requires the additional check for p2 immediately following p1.
    237                 //
    238                 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
    239                     continue;
    240                 }
    241 
    242                 // Rule (GB4).   ( Control | CR | LF ) <break>
    243                 if (fControlSet.contains(c1) ||
    244                         c1 == 0x0D ||
    245                         c1 == 0x0A)  {
    246                     break;
    247                 }
    248 
    249                 // Rule (GB5)    <break>  ( Control | CR | LF )
    250                 //
    251                 if (fControlSet.contains(c2) ||
    252                         c2 == 0x0D ||
    253                         c2 == 0x0A)  {
    254                     break;
    255                 }
    256 
    257 
    258                 // Rule (GB6)  L x ( L | V | LV | LVT )
    259                 if (fLSet.contains(c1) &&
    260                         (fLSet.contains(c2)  ||
    261                                 fVSet.contains(c2)  ||
    262                                 fLVSet.contains(c2) ||
    263                                 fLVTSet.contains(c2))) {
    264                     continue;
    265                 }
    266 
    267                 // Rule (GB7)    ( LV | V )  x  ( V | T )
    268                 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
    269                         (fVSet.contains(c2) || fTSet.contains(c2)))  {
    270                     continue;
    271                 }
    272 
    273                 // Rule (GB8)    ( LVT | T)  x T
    274                 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
    275                         fTSet.contains(c2))  {
    276                     continue;
    277                 }
    278 
    279                 // Rule (GB9)    x (Extend | ZWJ)
    280                 if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
    281                     if (!fExtendSet.contains(c1)) {
    282                         cBase = c1;
    283                     }
    284                     continue;
    285                 }
    286 
    287                 // Rule (GB9a)   x  SpacingMark
    288                 if (fSpacingSet.contains(c2)) {
    289                     continue;
    290                 }
    291 
    292                 // Rule (GB9b)   Prepend x
    293                 if (fPrependSet.contains(c1)) {
    294                     continue;
    295                 }
    296                 // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
    297                 if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
    298                     continue;
    299                 }
    300                 if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
    301                         fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
    302                     continue;
    303                 }
    304 
    305                 // Rule (GB11)   (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji)
    306                 if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) &&
    307                         (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
    308                     continue;
    309                 }
    310                 if ((fExtendedPictSet.contains(cBase) || fEmojiNRKSet.contains(cBase)) && fExtendSet.contains(c0) && fZWJSet.contains(c1) &&
    311                         (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
    312                     continue;
    313                 }
    314 
    315                 // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
    316                 //                  Note: The first if condition is a little tricky. We only need to force
    317                 //                      a break if there are three or more contiguous RIs. If there are
    318                 //                      only two, a break following will occur via other rules, and will include
    319                 //                      any trailing extend characters, which is needed behavior.
    320                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
    321                         && fRegionalIndicatorSet.contains(c2)) {
    322                     break;
    323                 }
    324                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
    325                     continue;
    326                 }
    327 
    328                 // Rule (GB999)  Any  <break>  Any
    329                 break;
    330             }
    331 
    332             breakPos = p2;
    333             return breakPos;
    334         }
    335     }
    336 
    337 
    338     /**
    339      *
    340      * Word Monkey Test Class
    341      *
    342      *
    343      *
    344      */
    345     static class RBBIWordMonkey extends RBBIMonkeyKind {
    346         List                      fSets;
    347         StringBuffer              fText;
    348 
    349         UnicodeSet                fCRSet;
    350         UnicodeSet                fLFSet;
    351         UnicodeSet                fNewlineSet;
    352         UnicodeSet                fRegionalIndicatorSet;
    353         UnicodeSet                fKatakanaSet;
    354         UnicodeSet                fHebrew_LetterSet;
    355         UnicodeSet                fALetterSet;
    356         UnicodeSet                fSingle_QuoteSet;
    357         UnicodeSet                fDouble_QuoteSet;
    358         UnicodeSet                fMidNumLetSet;
    359         UnicodeSet                fMidLetterSet;
    360         UnicodeSet                fMidNumSet;
    361         UnicodeSet                fNumericSet;
    362         UnicodeSet                fFormatSet;
    363         UnicodeSet                fExtendSet;
    364         UnicodeSet                fExtendNumLetSet;
    365         UnicodeSet                fOtherSet;
    366         UnicodeSet                fDictionarySet;
    367         UnicodeSet                fEBaseSet;
    368         UnicodeSet                fEBGSet;
    369         UnicodeSet                fEModifierSet;
    370         UnicodeSet                fZWJSet;
    371         UnicodeSet                fExtendedPictSet;
    372         UnicodeSet                fEmojiNRKSet;
    373 
    374 
    375         RBBIWordMonkey() {
    376             fCharProperty    = UProperty.WORD_BREAK;
    377 
    378             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
    379             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
    380             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
    381             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
    382             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
    383             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
    384             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
    385             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
    386             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
    387             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
    388             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
    389             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
    390             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
    391             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
    392             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
    393             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
    394             fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}]");
    395             fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
    396             fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
    397             fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
    398             fExtendedPictSet = new UnicodeSet(gExtended_Pict);
    399             fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]");
    400 
    401             fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
    402             fDictionarySet.addAll(fKatakanaSet);
    403             fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
    404 
    405             fALetterSet.removeAll(fDictionarySet);
    406 
    407             fOtherSet        = new UnicodeSet();
    408             fOtherSet.complement();
    409             fOtherSet.removeAll(fCRSet);
    410             fOtherSet.removeAll(fLFSet);
    411             fOtherSet.removeAll(fNewlineSet);
    412             fOtherSet.removeAll(fALetterSet);
    413             fOtherSet.removeAll(fSingle_QuoteSet);
    414             fOtherSet.removeAll(fDouble_QuoteSet);
    415             fOtherSet.removeAll(fKatakanaSet);
    416             fOtherSet.removeAll(fHebrew_LetterSet);
    417             fOtherSet.removeAll(fMidLetterSet);
    418             fOtherSet.removeAll(fMidNumSet);
    419             fOtherSet.removeAll(fNumericSet);
    420             fOtherSet.removeAll(fFormatSet);
    421             fOtherSet.removeAll(fExtendSet);
    422             fOtherSet.removeAll(fExtendNumLetSet);
    423             fOtherSet.removeAll(fRegionalIndicatorSet);
    424             fOtherSet.removeAll(fEBaseSet);
    425             fOtherSet.removeAll(fEBGSet);
    426             fOtherSet.removeAll(fEModifierSet);
    427             fOtherSet.removeAll(fZWJSet);
    428             fOtherSet.removeAll(fExtendedPictSet);
    429             fOtherSet.removeAll(fEmojiNRKSet);
    430 
    431             // Inhibit dictionary characters from being tested at all.
    432             // remove surrogates so as to not generate higher CJK characters
    433             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
    434             fOtherSet.removeAll(fDictionarySet);
    435 
    436             fSets            = new ArrayList();
    437             fSets.add(fCRSet);
    438             fSets.add(fLFSet);
    439             fSets.add(fNewlineSet);
    440             fSets.add(fRegionalIndicatorSet);
    441             fSets.add(fHebrew_LetterSet);
    442             fSets.add(fALetterSet);
    443             //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
    444             // from the test data. They are all in the dictionary set,
    445             // which this (old, to be retired) monkey test cannot handle.
    446             fSets.add(fSingle_QuoteSet);
    447             fSets.add(fDouble_QuoteSet);
    448             fSets.add(fMidLetterSet);
    449             fSets.add(fMidNumLetSet);
    450             fSets.add(fMidNumSet);
    451             fSets.add(fNumericSet);
    452             fSets.add(fFormatSet);
    453             fSets.add(fExtendSet);
    454             fSets.add(fExtendNumLetSet);
    455             fSets.add(fRegionalIndicatorSet);
    456             fSets.add(fEBaseSet);
    457             fSets.add(fEBGSet);
    458             fSets.add(fEModifierSet);
    459             fSets.add(fZWJSet);
    460             fSets.add(fExtendedPictSet);
    461             fSets.add(fEmojiNRKSet);
    462             fSets.add(fOtherSet);
    463         }
    464 
    465 
    466         @Override
    467         List  charClasses() {
    468             return fSets;
    469         }
    470 
    471         @Override
    472         void   setText(StringBuffer s) {
    473             fText = s;
    474         }
    475 
    476         @Override
    477         int   next(int prevPos) {
    478             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
    479             //   break position being tested.  The candidate break
    480             //   location is before p2.
    481             int     breakPos = -1;
    482 
    483             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    484 
    485             // Previous break at end of string.  return DONE.
    486             if (prevPos >= fText.length()) {
    487                 return -1;
    488             }
    489             /*p0 =*/ p1 = p2 = p3 = prevPos;
    490             c3 = UTF16.charAt(fText, prevPos);
    491             c0 = c1 = c2 = 0;
    492 
    493 
    494 
    495             // Loop runs once per "significant" character position in the input text.
    496             for (;;) {
    497                 // Move all of the positions forward in the input string.
    498                 /*p0 = p1;*/  c0 = c1;
    499                 p1 = p2;  c1 = c2;
    500                 p2 = p3;  c2 = c3;
    501 
    502                 // Advance p3 by    X(Extend | Format)*   Rule 4
    503                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
    504                 do {
    505                     p3 = moveIndex32(fText, p3, 1);
    506                     c3 = -1;
    507                     if (p3>=fText.length()) {
    508                         break;
    509                     }
    510                     c3 = UTF16.charAt(fText, p3);
    511                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
    512                         break;
    513                     }
    514                 }
    515                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
    516 
    517                 if (p1 == p2) {
    518                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
    519                     continue;
    520                 }
    521                 if (p2 == fText.length()) {
    522                     // Reached end of string.  Always a break position.
    523                     break;
    524                 }
    525 
    526                 // Rule (3)   CR x LF
    527                 //     No Extend or Format characters may appear between the CR and LF,
    528                 //     which requires the additional check for p2 immediately following p1.
    529                 //
    530                 if (c1==0x0D && c2==0x0A) {
    531                     continue;
    532                 }
    533 
    534                 // Rule (3a)  Break before and after newlines (including CR and LF)
    535                 //
    536                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
    537                     break;
    538                 }
    539                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
    540                     break;
    541                 }
    542 
    543                 // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
    544                 //              Not ignoring extend chars, so peek into input text to
    545                 //              get the potential ZWJ, the character immediately preceding c2.
    546                 if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
    547                     continue;
    548                 }
    549 
    550                 // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
    551                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
    552                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
    553                     continue;
    554                 }
    555 
    556                 // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
    557                 //
    558                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
    559                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
    560                         (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
    561                     continue;
    562                 }
    563 
    564                 // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
    565                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
    566                         (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
    567                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
    568                     continue;
    569                 }
    570 
    571                 // Rule (7a)     Hebrew_Letter x Single_Quote
    572                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
    573                     continue;
    574                 }
    575 
    576                 // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
    577                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
    578                     continue;
    579                 }
    580 
    581                 // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
    582                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
    583                     continue;
    584                 }
    585 
    586                 //  Rule (8)    Numeric x Numeric
    587                 if (fNumericSet.contains(c1) &&
    588                         fNumericSet.contains(c2))  {
    589                     continue;
    590                 }
    591 
    592                 // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
    593                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
    594                         fNumericSet.contains(c2))  {
    595                     continue;
    596                 }
    597 
    598                 // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
    599                 if (fNumericSet.contains(c1) &&
    600                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
    601                     continue;
    602                 }
    603 
    604                 // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
    605                 if (fNumericSet.contains(c0) &&
    606                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
    607                         fNumericSet.contains(c2)) {
    608                     continue;
    609                 }
    610 
    611                 // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
    612                 if (fNumericSet.contains(c1) &&
    613                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
    614                         setContains(fNumericSet, c3)) {
    615                     continue;
    616                 }
    617 
    618                 // Rule (13)  Katakana x Katakana
    619                 //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
    620                 //                  all Katakana are handled by the dictionary breaker.
    621                 if (fKatakanaSet.contains(c1) &&
    622                         fKatakanaSet.contains(c2))  {
    623                     continue;
    624                 }
    625 
    626                 // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
    627                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
    628                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
    629                         fExtendNumLetSet.contains(c2)) {
    630                     continue;
    631                 }
    632 
    633                 // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
    634                 if (fExtendNumLetSet.contains(c1) &&
    635                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
    636                                 fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
    637                     continue;
    638                 }
    639 
    640 
    641                 // Rule 14 (E_Base | EBG) x E_Modifier
    642                 if ((fEBaseSet.contains(c1)  || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
    643                     continue;
    644                 }
    645 
    646                 // Rule 15 - 17   Group pairs of Regional Indicators
    647                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
    648                     break;
    649                 }
    650                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
    651                     continue;
    652                 }
    653 
    654                 // Rule 999.  Break found here.
    655                 break;
    656             }
    657 
    658             breakPos = p2;
    659             return breakPos;
    660         }
    661 
    662     }
    663 
    664 
    665     static class RBBILineMonkey extends RBBIMonkeyKind {
    666 
    667         List        fSets;      // The character-class UnicodeSets registered by the constructor.
    668 
    669         // UnicodeSets for each of the Line Breaking character classes.
    670         // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
    671         // to verify that they are all accounted for.
    672 
    673         UnicodeSet  fBK;
    674         UnicodeSet  fCR;
    675         UnicodeSet  fLF;
    676         UnicodeSet  fCM;
    677         UnicodeSet  fNL;
    678         UnicodeSet  fSG;
    679         UnicodeSet  fWJ;
    680         UnicodeSet  fZW;
    681         UnicodeSet  fGL;
    682         UnicodeSet  fSP;
    683         UnicodeSet  fB2;
    684         UnicodeSet  fBA;
    685         UnicodeSet  fBB;
    686         UnicodeSet  fHY;
    687         UnicodeSet  fCB;
    688         UnicodeSet  fCL;
    689         UnicodeSet  fCP;
    690         UnicodeSet  fEX;
    691         UnicodeSet  fIN;
    692         UnicodeSet  fNS;
    693         UnicodeSet  fOP;
    694         UnicodeSet  fQU;
    695         UnicodeSet  fIS;
    696         UnicodeSet  fNU;
    697         UnicodeSet  fPO;
    698         UnicodeSet  fPR;
    699         UnicodeSet  fSY;
    700         UnicodeSet  fAI;
    701         UnicodeSet  fAL;
    702         UnicodeSet  fCJ;
    703         UnicodeSet  fH2;
    704         UnicodeSet  fH3;
    705         UnicodeSet  fHL;
    706         UnicodeSet  fID;
    707         UnicodeSet  fJL;
    708         UnicodeSet  fJV;
    709         UnicodeSet  fJT;
    710         UnicodeSet  fRI;
    711         UnicodeSet  fXX;
    712         UnicodeSet  fEB;
    713         UnicodeSet  fEM;
    714         UnicodeSet  fZWJ;
                // The two sets below are not Line_Break classes; the constructor builds them
                // from emoji data and next() consults them for rule LB 8a only.
    715         UnicodeSet  fExtendedPict;
    716         UnicodeSet  fEmojiNRK;
    717 
                // Text under test: assigned by setText() and scanned by next().
    718         StringBuffer  fText;
                // NOTE(review): fOrigPositions is not referenced in the visible part of this
                // class - confirm whether it is still needed.
    719         int           fOrigPositions;
    720 
    721 
    722 
    723         RBBILineMonkey()
    724         {
                    // Builds one UnicodeSet per line break class, folds the classes that share
                    // default behavior into AL / NS / CM, and collects the sets in fSets.
                    // NOTE(review): fCharProperty is not declared in the visible part of this
                    // class; presumably it is inherited from RBBIMonkeyKind.
    725             fCharProperty  = UProperty.LINE_BREAK;
    726             fSets          = new ArrayList();
    727 
    728             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
    729             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
    730             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
    731             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
    732             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
                    // SG is built from the surrogate code unit range rather than from a
                    // Line_break property query.
    733             fSG    = new UnicodeSet("[\\ud800-\\udfff]");
    734             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
    735             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
    736             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
    737             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
    738             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
    739             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
    740             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
    741             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
    742             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
    743             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
    744             fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
    745             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
    746             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
    747             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
    748             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
    749             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
    750             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
    751             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
    752             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
    753             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
    754             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
    755             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
    756             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
    757             fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
    758             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
    759             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
    760             fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
    761             fID    = new UnicodeSet("[\\p{Line_break=ID}]");
    762             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
    763             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
    764             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
    765             fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
    766             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
    767             fEB    = new UnicodeSet("[\\p{Line_break=EB}]");
    768             fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
    769             fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
                    // Emoji, excluding Regional Indicators, '*', '#' and the ASCII digits.
    770             fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9]]");
                    // Copy of gExtended_Pict, which is defined outside this constructor.
    771             fExtendedPict = new UnicodeSet(gExtended_Pict);
    772 
    773 
    774             // Remove dictionary characters.
    775             // The monkey test reference implementation of line break does not replicate the dictionary behavior,
    776             // so dictionary characters are omitted from the monkey test data.
                    // NOTE(review): dictionarySet is constructed but not referenced again in this
                    // constructor, so no characters are actually removed from any set here.
    777             @SuppressWarnings("unused")
    778             UnicodeSet dictionarySet = new UnicodeSet(
    779                     "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
    780 
    781             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
    782             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
    783             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
    784 
    785             fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
    786             fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
    787 
                    // Register the sets.  CJ and XX are not added on their own (they were folded
                    // into NS and AL above); AI, SG and ZWJ are added both on their own and as
                    // part of AL / CM.
    788             fSets.add(fBK);
    789             fSets.add(fCR);
    790             fSets.add(fLF);
    791             fSets.add(fCM);
    792             fSets.add(fNL);
    793             fSets.add(fWJ);
    794             fSets.add(fZW);
    795             fSets.add(fGL);
    796             fSets.add(fSP);
    797             fSets.add(fB2);
    798             fSets.add(fBA);
    799             fSets.add(fBB);
    800             fSets.add(fHY);
    801             fSets.add(fCB);
    802             fSets.add(fCL);
    803             fSets.add(fCP);
    804             fSets.add(fEX);
    805             fSets.add(fIN);
    806             fSets.add(fJL);
    807             fSets.add(fJT);
    808             fSets.add(fJV);
    809             fSets.add(fNS);
    810             fSets.add(fOP);
    811             fSets.add(fQU);
    812             fSets.add(fIS);
    813             fSets.add(fNU);
    814             fSets.add(fPO);
    815             fSets.add(fPR);
    816             fSets.add(fSY);
    817             fSets.add(fAI);
    818             fSets.add(fAL);
    819             fSets.add(fH2);
    820             fSets.add(fH3);
    821             fSets.add(fHL);
    822             fSets.add(fID);
    823             fSets.add(fWJ);     // NOTE(review): fWJ was already added above; confirm the duplicate entry is intended.
    824             fSets.add(fRI);
    825             fSets.add(fSG);
    826             fSets.add(fEB);
    827             fSets.add(fEM);
    828             fSets.add(fZWJ);
    829             fSets.add(fExtendedPict);
    830             fSets.add(fEmojiNRK);
    831         }
    832 
    833         @Override
                // Sets the text that later next() calls will scan.  The StringBuffer is
                // stored by reference; no copy is made.
    834         void setText(StringBuffer s) {
    835             fText       = s;
    836         }
    837 
    838 
    839 
    840 
    841         @Override
                // Reference implementation of the line break search.  Scans fText forward from
                // startPos and returns the first break position found, applying the rules in
                // the order of the "LB n" comments below.
                //
                //   startPos  index in fText at which the scan begins.
                //   returns   -1 if startPos is at or beyond the end of fText; otherwise the
                //             index of the character that follows the break (a value that is
                //             >= fText.length() when the LB 2 end-of-text test ended the scan).
                //
                // Inside the main loop, "continue" means "no break before pos, go on to the
                // next position" and "break" means "a break was found before pos".
    842         int next(int startPos) {
    843             int    pos;       //  Index of the char following a potential break position
    844             int    thisChar;  //  Character at above position "pos"
    845 
    846             int    prevPos;   //  Index of the char preceding a potential break position
    847             int    prevChar;  //  Character at above position.  Note that prevChar
    848             //   and thisChar may not be adjacent because combining
    849             //   characters between them will be ignored.
    850             int    prevCharX2; //  Character before prevChar, more context for LB 21a (also used by LB 30a)
    851 
    852             int    nextPos;   //  Index of the next character following pos.
    853             //     Usually skips over combining marks.
    854             int    tPos;      //  temp value.
    855             int    matchVals[]  = null;       // Number  Expression Match Results
    856 
    857 
    858             if (startPos >= fText.length()) {
    859                 return -1;
    860             }
    861 
    862 
    863             // Initial values for loop.  Loop will run the first time without finding breaks,
    864             //                           while the invalid values shift out and the "this" and
    865             //                           "prev" positions are filled in with good values.
    866             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    867             thisChar = prevChar  = prevCharX2 = 0;
    868             nextPos  = startPos;
    869 
    870 
    871             // Loop runs once per position in the test text, until a break position
    872             //  is found.  In each iteration, we are testing for a possible break
    873             //  just preceding the character at index "pos".  The character preceding
    874             //  this char is at position "prevPos"; because of combining sequences,
    875             //  "prevPos" can be arbitrarily far before "pos".
    876             for (;;) {
    877                 // Advance to the next position to be tested.
    878                 prevCharX2 = prevChar;
    879                 prevPos   = pos;
    880                 prevChar  = thisChar;
    881                 pos       = nextPos;
    882                 nextPos   = moveIndex32(fText, pos, 1);
    883 
    884                 // Rule LB2 - Break at end of text.
    885                 if (pos >= fText.length()) {
    886                     break;
    887                 }
    888 
    889                 // Rule LB 9 - adjust for combining sequences.
    890                 //             We do this rule out-of-order because the adjustment does
    891                 //             not affect the way that rules LB 3 through LB 6 match,
    892                 //             and doing it here rather than after LB 6 is substantially
    893                 //             simpler when combining sequences do occur.
    894 
    895 
    896                 // LB 9         Keep combining sequences together.
    897                 //              advance over any CM class chars at "pos",
    898                 //              result is "nextPos" for the following loop iteration.
                        //              The scan is not done when thisChar is SP, BK, CR (0x0d),
                        //              LF (0x0a), NL or ZW; nextPos is left unchanged for those.
    899                 thisChar  = UTF16.charAt(fText, pos);
    900                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
    901                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
    902                     for (;;) {
    903                         if (nextPos == fText.length()) {
    904                             break;
    905                         }
    906                         int nextChar = UTF16.charAt(fText, nextPos);
    907                         if (!fCM.contains(nextChar)) {
    908                             break;
    909                         }
    910                         nextPos = moveIndex32(fText, nextPos, 1);
    911                     }
    912                 }
    913 
    914                 // LB 9 Treat X CM* as if it were X
    915                 //        No explicit action required.
    916 
    917                 // LB 10     Treat any remaining combining mark as AL
                        //           The letter 'A' is substituted so that the rule tests below see
                        //           an alphabetic character ('A' is expected to be in fAL).
    918                 if (fCM.contains(thisChar)) {
    919                     thisChar = 'A';
    920                 }
    921 
    922 
    923                 // If the loop is still warming up - if we haven't shifted the initial
    924                 //   -1 positions out of prevPos yet - loop back to advance the
    925                 //    position in the input without any further looking for breaks.
    926                 if (prevPos == -1) {
    927                     continue;
    928                 }
    929 
    930                 // LB 4  Always break after hard line breaks,
    931                 if (fBK.contains(prevChar)) {
    932                     break;
    933                 }
    934 
    935                 // LB 5  Break after CR, LF, NL, but not inside CR LF
    936                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
    937                     continue;
    938                 }
    939                 if  (fCR.contains(prevChar) ||
    940                         fLF.contains(prevChar) ||
    941                         fNL.contains(prevChar))  {
    942                     break;
    943                 }
    944 
    945                 // LB 6  Don't break before hard line breaks
    946                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
    947                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
    948                     continue;
    949                 }
    950 
    951 
    952                 // LB 7  Don't break before spaces or zero-width space.
    953                 if (fSP.contains(thisChar)) {
    954                     continue;
    955                 }
    956 
    957                 if (fZW.contains(thisChar)) {
    958                     continue;
    959                 }
    960 
    961                 // LB 8  Break after zero width space
    962                 if (fZW.contains(prevChar)) {
    963                     break;
    964                 }
    965 
    966                 // LB 8a:  ZWJ x (ID | Extended_Pictographic | Emoji)
    967                 //       The monkey test's way of ignoring combining characters doesn't work
    968                 //       for this rule. ZWJ is also a CM. Need to get the actual character
    969                 //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
    970                 {
    971                     int prevC = fText.codePointBefore(pos);
    972                     if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
    973                         continue;
    974                     }
    975                 }
    976 
    977                 //  LB 9, 10  Already done, at top of loop.
    978                 //
    979 
    980 
    981                 // LB 11
    982                 //    x  WJ
    983                 //    WJ  x
    984                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
    985                     continue;
    986                 }
    987 
    988 
    989                 // LB 12
    990                 //        GL x
    991                 if (fGL.contains(prevChar)) {
    992                     continue;
    993                 }
    994 
    995                 // LB 12a
    996                 //    [^SP BA HY] x GL
    997                 if (!(fSP.contains(prevChar) ||
    998                         fBA.contains(prevChar) ||
    999                         fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
   1000                     continue;
   1001                 }
   1002 
   1003 
   1004 
   1005                 // LB 13  Don't break before closings.
   1006                 //       NU x CL, NU x CP  and NU x IS are not matched here so that they will
   1007                 //       fall into LB 25 and the more general number regular expression.
                        //       The code likewise skips NU x SY; EX is matched unconditionally.
   1008                 //
   1009                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
   1010                         !fNU.contains(prevChar) && fCP.contains(thisChar) ||
   1011                         fEX.contains(thisChar) ||
   1012                         !fNU.contains(prevChar) && fIS.contains(thisChar) ||
   1013                         !fNU.contains(prevChar) && fSY.contains(thisChar))    {
   1014                     continue;
   1015                 }
   1016 
   1017                 // LB 14  Don't break after OP SP*
   1018                 //       Scan backwards, checking for this sequence.
   1019                 //       The OP char could include combining marks, so we actually check for
   1020                 //           OP CM* SP* x
                        //       The "tPos > 0" guards stop the backward scans at the start of the text.
   1021                 tPos = prevPos;
   1022                 if (fSP.contains(prevChar)) {
   1023                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1024                         tPos=moveIndex32(fText, tPos, -1);
   1025                     }
   1026                 }
   1027                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1028                     tPos=moveIndex32(fText, tPos, -1);
   1029                 }
   1030                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
   1031                     continue;
   1032                 }
   1033 
   1034                 // LB 15 Do not break within "[
   1035                 //       QU CM* SP* x OP
   1036                 if (fOP.contains(thisChar)) {
   1037                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   1038                     tPos = prevPos;
   1039                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1040                         tPos = moveIndex32(fText, tPos, -1);
   1041                     }
   1042                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1043                         tPos = moveIndex32(fText, tPos, -1);
   1044                     }
   1045                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
   1046                         continue;
   1047                     }
   1048                 }
   1049 
   1050                 // LB 16   (CL | CP) SP* x NS
                        //         Same backward scan as LB 15, looking for CL or CP instead of QU.
   1051                 if (fNS.contains(thisChar)) {
   1052                     tPos = prevPos;
   1053                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1054                         tPos = moveIndex32(fText, tPos, -1);
   1055                     }
   1056                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1057                         tPos = moveIndex32(fText, tPos, -1);
   1058                     }
   1059                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
   1060                         continue;
   1061                     }
   1062                 }
   1063 
   1064 
   1065                 // LB 17        B2 SP* x B2
                        //              Same backward scan again, looking for a preceding B2.
   1066                 if (fB2.contains(thisChar)) {
   1067                     tPos = prevPos;
   1068                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
   1069                         tPos = moveIndex32(fText, tPos, -1);
   1070                     }
   1071                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
   1072                         tPos = moveIndex32(fText, tPos, -1);
   1073                     }
   1074                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
   1075                         continue;
   1076                     }
   1077                 }
   1078 
   1079                 // LB 18    break after space
   1080                 if (fSP.contains(prevChar)) {
   1081                     break;
   1082                 }
   1083 
   1084                 // LB 19
   1085                 //    x   QU
   1086                 //    QU  x
   1087                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
   1088                     continue;
   1089                 }
   1090 
   1091                 // LB 20  Break around a CB
   1092                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
   1093                     break;
   1094                 }
   1095 
   1096                 // LB 21
                        //    x  (BA | HY | NS)
                        //    BB  x
   1097                 if (fBA.contains(thisChar) ||
   1098                         fHY.contains(thisChar) ||
   1099                         fNS.contains(thisChar) ||
   1100                         fBB.contains(prevChar) )   {
   1101                     continue;
   1102                 }
   1103 
   1104                 // LB 21a, HL (HY | BA) x
   1105                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
   1106                     continue;
   1107                 }
   1108 
   1109                 // LB 21b, SY x HL
   1110                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
   1111                     continue;
   1112                 }
   1113 
   1114                 // LB 22
                        //    (AL | EX | HL | ID | EB | EM | IN | NU) x IN
   1115                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
   1116                         fEX.contains(prevChar) && fIN.contains(thisChar) ||
   1117                         fHL.contains(prevChar) && fIN.contains(thisChar) ||
   1118                         (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) ||
   1119                         fIN.contains(prevChar) && fIN.contains(thisChar) ||
   1120                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {
   1121                     continue;
   1122                 }
   1123 
   1124                 // LB 23    (AL | HL) x NU
   1125                 //          NU x (AL | HL)
   1126                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
   1127                     continue;
   1128                 }
   1129                 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1130                     continue;
   1131                 }
   1132 
   1133                 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
   1134                 //      PR x (ID | EB | EM)
   1135                 //     (ID | EB | EM) x PO
   1136                 if (fPR.contains(prevChar) &&
   1137                         (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
   1138                     continue;
   1139                 }
   1140                 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
   1141                         fPO.contains(thisChar)) {
   1142                     continue;
   1143                 }
   1144 
   1145                 // LB 24  Do not break between prefix and letters or ideographs.
   1146                 //         (PR | PO) x (AL | HL)
   1147                 //         (AL | HL) x (PR | PO)
   1148                 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
   1149                         (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1150                     continue;
   1151                 }
   1152                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
   1153                         (fPR.contains(thisChar) || fPO.contains(thisChar))) {
   1154                     continue;
   1155                 }
   1156 
   1157 
   1158                 // LB 25    Numbers
                        //          matchVals is passed back in on each iteration; LBNumberCheck
                        //          allocates the array only when it is handed null.
   1159                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
   1160                 if (matchVals[0] != -1) {
   1161                     // Matched a number.  But could have been just a single digit, which would
   1162                     //    not represent a "no break here" between prevChar and thisChar
   1163                     int numEndIdx = matchVals[1];  // idx of first char following num
   1164                     if (numEndIdx > pos) {
   1165                         // Number match includes at least the two chars being checked
   1166                         if (numEndIdx > nextPos) {
   1167                             // Number match includes additional chars.  Update pos and nextPos
   1168                             //   so that next loop iteration will continue at the end of the number,
   1169                             //   checking for breaks between last char in number & whatever follows.
   1170                             nextPos = numEndIdx;
   1171                             pos     = numEndIdx;
                                    // Back up from the end of the number over any trailing CM chars, so
                                    //   that thisChar (which becomes prevChar at the top of the loop) is
                                    //   the last non-CM char of the number.
   1172                             do {
   1173                                 pos = moveIndex32(fText, pos, -1);
   1174                                 thisChar = UTF16.charAt(fText, pos);
   1175                             }
   1176                             while (fCM.contains(thisChar));
   1177                         }
   1178                         continue;
   1179                     }
   1180                 }
   1181 
   1182 
   1183                 // LB 26  Do not break Korean Syllables
                        //          JL x (JL | JV | H2 | H3)
                        //          (JV | H2) x (JV | JT)
                        //          (JT | H3) x JT
   1184                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
   1185                         fJV.contains(thisChar) ||
   1186                         fH2.contains(thisChar) ||
   1187                         fH3.contains(thisChar))) {
   1188                     continue;
   1189                 }
   1190 
   1191                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
   1192                         (fJV.contains(thisChar) || fJT.contains(thisChar))) {
   1193                     continue;
   1194                 }
   1195 
   1196                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
   1197                         fJT.contains(thisChar)) {
   1198                     continue;
   1199                 }
   1200 
   1201                 // LB 27 Treat a Korean Syllable Block the same as ID
                        //          (JL | JV | JT | H2 | H3) x IN
                        //          (JL | JV | JT | H2 | H3) x PO
                        //          PR x (JL | JV | JT | H2 | H3)
   1202                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
   1203                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
   1204                         fIN.contains(thisChar)) {
   1205                     continue;
   1206                 }
   1207                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
   1208                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
   1209                         fPO.contains(thisChar)) {
   1210                     continue;
   1211                 }
   1212                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
   1213                         fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
   1214                     continue;
   1215                 }
   1216 
   1217 
   1218 
   1219                 // LB 28 Do not break between alphabetics
   1220                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1221                     continue;
   1222                 }
   1223 
   1224                 // LB 29  Do not break between numeric punctuation and alphabetics
   1225                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
   1226                     continue;
   1227                 }
   1228 
   1229                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   1230                 //          (AL | HL | NU) x OP
   1231                 //          CP x (AL | HL | NU)
   1232                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
   1233                     continue;
   1234                 }
   1235                 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
   1236                     continue;
   1237                 }
   1238 
   1239                 // LB 30a   Break between pairs of Regional Indicators.
   1240                 //             RI RI <break> RI
   1241                 //             RI    x    RI
   1242                 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
   1243                     break;
   1244                 }
   1245                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
   1246                     continue;
   1247                 }
   1248 
   1249                 // LB30b    Emoji Base x Emoji Modifier
   1250                 if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
   1251                     continue;
   1252                 }
   1253                 // LB 31    Break everywhere else
   1254                 break;
   1255             }
   1256 
   1257             return pos;
   1258         }
   1259 
   1260 
   1261 
   1262         // Match the following regular expression in the input text.
   1263         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
   1264         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
   1265         //  retVals array  [0]  index of the start of the match, or -1 if no match
   1266         //                 [1]  index of first char following the match.
   1267         //  Can not use Java regex because need supplementary character support,
   1268         //     and because Unicode char properties version must be the same as in
   1269         //     the version of ICU being tested.
   1270         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
   1271             if (retVals == null) {
   1272                 retVals = new int[2];
   1273             }
   1274             retVals[0]     = -1;  // Indicates no match.
   1275             int matchState = 0;
   1276             int idx        = startIdx;
   1277 
   1278             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
   1279                 int c = UTF16.charAt(s, idx);
   1280                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
   1281                 switch (matchState) {
   1282                 case 0:
   1283                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
   1284                     cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1285                         matchState = 1;
   1286                         break;
   1287                     }
   1288                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
   1289                         matchState = 4;
   1290                         break;
   1291                     }
   1292                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
   1293                         matchState = 4;
   1294                         break;
   1295                     }
   1296                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1297                         matchState = 7;
   1298                         break;
   1299                     }
   1300                     break matchLoop;   /* No Match  */
   1301 
   1302                 case 1:
   1303                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1304                         matchState = 1;
   1305                         break;
   1306                     }
   1307                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
   1308                         matchState = 4;
   1309                         break;
   1310                     }
   1311                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
   1312                         matchState = 4;
   1313                         break;
   1314                     }
   1315                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1316                         matchState = 7;
   1317                         break;
   1318                     }
   1319                     break matchLoop;   /* No Match  */
   1320 
   1321 
   1322                 case 4:
   1323                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1324                         matchState = 4;
   1325                         break;
   1326                     }
   1327                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1328                         matchState = 7;
   1329                         break;
   1330                     }
   1331                     break matchLoop;   /* No Match  */
   1332                     //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
   1333                     //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
   1334 
   1335                 case 7:
   1336                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1337                         matchState = 7;
   1338                         break;
   1339                     }
   1340                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
   1341                         matchState = 7;
   1342                         break;
   1343                     }
   1344                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
   1345                         matchState = 7;
   1346                         break;
   1347                     }
   1348                     if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
   1349                         matchState = 7;
   1350                         break;
   1351                     }
   1352                     if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
   1353                         matchState = 9;
   1354                         break;
   1355                     }
   1356                     if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
   1357                         matchState = 9;
   1358                         break;
   1359                     }
   1360                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1361                         matchState = 11;
   1362                         break;
   1363                     }
   1364                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
   1365                         matchState = 11;
   1366                         break;
   1367                     }
   1368 
   1369                     break matchLoop;    // Match Complete.
   1370                 case 9:
   1371                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1372                         matchState = 9;
   1373                         break;
   1374                     }
   1375                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
   1376                         matchState = 11;
   1377                         break;
   1378                     }
   1379                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
   1380                         matchState = 11;
   1381                         break;
   1382                     }
   1383                     break matchLoop;    // Match Complete.
   1384                 case 11:
   1385                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
   1386                         matchState = 11;
   1387                         break;
   1388                     }
   1389                     break matchLoop;    // Match Complete.
   1390                 }
   1391             }
   1392             if (matchState > 4) {
   1393                 retVals[0] = startIdx;
   1394                 retVals[1] = idx;
   1395             }
   1396             return retVals;
   1397         }
   1398 
   1399 
        // Returns fSets, this monkey's list of character classes.
        //   RunMonkey() casts each element to UnicodeSet and draws random test characters from it.
        @Override
        List  charClasses() {
            return fSets;
        }
   1404 
   1405 
   1406 
   1407     }
   1408 
   1409 
   1410     /**
   1411      *
   1412      * Sentence Monkey Test Class
   1413      *
   1414      *
   1415      *
   1416      */
   1417     static class RBBISentenceMonkey extends RBBIMonkeyKind {
   1418         List                 fSets;
   1419         StringBuffer         fText;
   1420 
   1421         UnicodeSet           fSepSet;
   1422         UnicodeSet           fFormatSet;
   1423         UnicodeSet           fSpSet;
   1424         UnicodeSet           fLowerSet;
   1425         UnicodeSet           fUpperSet;
   1426         UnicodeSet           fOLetterSet;
   1427         UnicodeSet           fNumericSet;
   1428         UnicodeSet           fATermSet;
   1429         UnicodeSet           fSContinueSet;
   1430         UnicodeSet           fSTermSet;
   1431         UnicodeSet           fCloseSet;
   1432         UnicodeSet           fOtherSet;
   1433         UnicodeSet           fExtendSet;
   1434 
   1435 
   1436 
   1437         RBBISentenceMonkey() {
   1438             fCharProperty  = UProperty.SENTENCE_BREAK;
   1439 
   1440             fSets            = new ArrayList();
   1441 
   1442             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   1443             //                       set and made into character classes of their own.  For the monkey impl,
   1444             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   1445             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
   1446             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
   1447             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
   1448             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
   1449             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
   1450             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
   1451             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
   1452             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
   1453             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
   1454             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
   1455             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
   1456             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
   1457             fOtherSet        = new UnicodeSet();
   1458 
   1459 
   1460             fOtherSet.complement();
   1461             fOtherSet.removeAll(fSepSet);
   1462             fOtherSet.removeAll(fFormatSet);
   1463             fOtherSet.removeAll(fSpSet);
   1464             fOtherSet.removeAll(fLowerSet);
   1465             fOtherSet.removeAll(fUpperSet);
   1466             fOtherSet.removeAll(fOLetterSet);
   1467             fOtherSet.removeAll(fNumericSet);
   1468             fOtherSet.removeAll(fATermSet);
   1469             fOtherSet.removeAll(fSContinueSet);
   1470             fOtherSet.removeAll(fSTermSet);
   1471             fOtherSet.removeAll(fCloseSet);
   1472             fOtherSet.removeAll(fExtendSet);
   1473 
   1474             fSets.add(fSepSet);
   1475             fSets.add(fFormatSet);
   1476 
   1477             fSets.add(fSpSet);
   1478             fSets.add(fLowerSet);
   1479             fSets.add(fUpperSet);
   1480             fSets.add(fOLetterSet);
   1481             fSets.add(fNumericSet);
   1482             fSets.add(fATermSet);
   1483             fSets.add(fSContinueSet);
   1484             fSets.add(fSTermSet);
   1485             fSets.add(fCloseSet);
   1486             fSets.add(fOtherSet);
   1487             fSets.add(fExtendSet);
   1488         }
   1489 
   1490 
   1491         @Override
   1492         List  charClasses() {
   1493             return fSets;
   1494         }
   1495 
   1496         @Override
   1497         void   setText(StringBuffer s) {
   1498             fText = s;
   1499         }
   1500 
   1501 
   1502         //      moveBack()   Find the "significant" code point preceding the index i.
   1503         //      Skips over ($Extend | $Format)*
   1504         //
   1505         private int moveBack(int i) {
   1506 
   1507             if (i <= 0) {
   1508                 return -1;
   1509             }
   1510 
   1511             int      c;
   1512             int      j = i;
   1513             do {
   1514                 j = moveIndex32(fText, j, -1);
   1515                 c = UTF16.charAt(fText, j);
   1516             }
   1517             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
   1518             return j;
   1519         }
   1520 
   1521 
   1522         int moveForward(int i) {
   1523             if (i>=fText.length()) {
   1524                 return fText.length();
   1525             }
   1526             int   c;
   1527             int   j = i;
   1528             do {
   1529                 j = moveIndex32(fText, j, 1);
   1530                 c = cAt(j);
   1531             }
   1532             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
   1533             return j;
   1534 
   1535         }
   1536 
   1537         int cAt(int pos) {
   1538             if (pos<0 || pos>=fText.length()) {
   1539                 return -1;
   1540             }
   1541             return UTF16.charAt(fText, pos);
   1542         }
   1543 
   1544         @Override
   1545         int   next(int prevPos) {
   1546             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
   1547             //   break position being tested.  The candidate break
   1548             //   location is before p2.
   1549             int     breakPos = -1;
   1550 
   1551             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
   1552             int c;
   1553 
   1554             // Prev break at end of string.  return DONE.
   1555             if (prevPos >= fText.length()) {
   1556                 return -1;
   1557             }
   1558             /*p0 =*/ p1 = p2 = p3 = prevPos;
   1559             c3 = UTF16.charAt(fText, prevPos);
   1560             c0 = c1 = c2 = 0;
   1561 
   1562             // Loop runs once per "significant" character position in the input text.
   1563             for (;;) {
   1564                 // Move all of the positions forward in the input string.
   1565                 /*p0 = p1;*/  c0 = c1;
   1566                 p1 = p2;  c1 = c2;
   1567                 p2 = p3;  c2 = c3;
   1568 
   1569                 // Advancd p3 by  X(Extend | Format)*   Rule 4
   1570                 p3 = moveForward(p3);
   1571                 c3 = cAt(p3);
   1572 
   1573                 // Rule (3) CR x LF
   1574                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   1575                     continue;
   1576                 }
   1577 
   1578                 // Rule (4)    Sep  <break>
   1579                 if (fSepSet.contains(c1)) {
   1580                     p2 = p1+1;   // Separators don't combine with Extend or Format
   1581                     break;
   1582                 }
   1583 
   1584                 if (p2 >= fText.length()) {
   1585                     // Reached end of string.  Always a break position.
   1586                     break;
   1587                 }
   1588 
   1589                 if (p2 == prevPos) {
   1590                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   1591                     continue;
   1592                 }
   1593 
   1594                 // Rule (6).   ATerm x Numeric
   1595                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
   1596                     continue;
   1597                 }
   1598 
   1599                 // Rule (7).  (Upper | Lower) ATerm  x  Uppper
   1600                 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
   1601                         fATermSet.contains(c1) && fUpperSet.contains(c2)) {
   1602                     continue;
   1603                 }
   1604 
   1605                 // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
   1606                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a
   1607                 //                  note to the Unicode 5.0 documents.
   1608                 int p8 = p1;
   1609                 while (p8>0 && fSpSet.contains(cAt(p8))) {
   1610                     p8 = moveBack(p8);
   1611                 }
   1612                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
   1613                     p8 = moveBack(p8);
   1614                 }
   1615                 if (fATermSet.contains(cAt(p8))) {
   1616                     p8=p2;
   1617                     for (;;) {
   1618                         c = cAt(p8);
   1619                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
   1620                                 fLowerSet.contains(c) || fSepSet.contains(c) ||
   1621                                 fATermSet.contains(c) || fSTermSet.contains(c))
   1622                         {
   1623                             break;
   1624                         }
   1625                         p8 = moveForward(p8);
   1626                     }
   1627                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
   1628                         continue;
   1629                     }
   1630                 }
   1631 
   1632                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
   1633                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
   1634                     p8 = p1;
   1635                     while (setContains(fSpSet, cAt(p8))) {
   1636                         p8 = moveBack(p8);
   1637                     }
   1638                     while (setContains(fCloseSet, cAt(p8))) {
   1639                         p8 = moveBack(p8);
   1640                     }
   1641                     c = cAt(p8);
   1642                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
   1643                         continue;
   1644                     }
   1645                 }
   1646 
   1647 
   1648                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   1649                 int p9 = p1;
   1650                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
   1651                     p9 = moveBack(p9);
   1652                 }
   1653                 c = cAt(p9);
   1654                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
   1655                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
   1656                         continue;
   1657                     }
   1658                 }
   1659 
   1660                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   1661                 int p10 = p1;
   1662                 while (p10>0 && fSpSet.contains(cAt(p10))) {
   1663                     p10 = moveBack(p10);
   1664                 }
   1665                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
   1666                     p10 = moveBack(p10);
   1667                 }
   1668                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
   1669                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
   1670                         continue;
   1671                     }
   1672                 }
   1673 
   1674                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
   1675                 int p11 = p1;
   1676                 if (p11>0 && fSepSet.contains(cAt(p11))) {
   1677                     p11 = moveBack(p11);
   1678                 }
   1679                 while (p11>0 && fSpSet.contains(cAt(p11))) {
   1680                     p11 = moveBack(p11);
   1681                 }
   1682                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
   1683                     p11 = moveBack(p11);
   1684                 }
   1685                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
   1686                     break;
   1687                 }
   1688 
   1689                 //  Rule (12)  Any x Any
   1690                 continue;
   1691             }
   1692             breakPos = p2;
   1693             return breakPos;
   1694         }
   1695 
   1696 
   1697 
   1698     }
   1699 
   1700 
   1701     /**
   1702      * Move an index into a string by n code points.
   1703      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
   1704      *   complicating usage.
   1705      * @param s   a Text string
   1706      * @param pos The starting code unit index into the text string
   1707      * @param amt The amount to adjust the string by.
   1708      * @return    The adjusted code unit index, pinned to the string's length, or
   1709      *            unchanged if input index was outside of the string.
   1710      */
   1711     static int moveIndex32(StringBuffer s, int pos, int amt) {
   1712         int i;
   1713         char  c;
   1714         if (amt>0) {
   1715             for (i=0; i<amt; i++) {
   1716                 if (pos >= s.length()) {
   1717                     return s.length();
   1718                 }
   1719                 c = s.charAt(pos);
   1720                 pos++;
   1721                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
   1722                     c = s.charAt(pos);
   1723                     if (UTF16.isTrailSurrogate(c)) {
   1724                         pos++;
   1725                     }
   1726                 }
   1727             }
   1728         } else {
   1729             for (i=0; i>amt; i--) {
   1730                 if (pos <= 0) {
   1731                     return 0;
   1732                 }
   1733                 pos--;
   1734                 c = s.charAt(pos);
   1735                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
   1736                     c = s.charAt(pos);
   1737                     if (UTF16.isLeadSurrogate(c)) {
   1738                         pos--;
   1739                     }
   1740                 }
   1741             }
   1742         }
   1743         return pos;
   1744     }
   1745 
   1746     /**
   1747      * No-exceptions form of UnicodeSet.contains(c).
   1748      *    Simplifies loops that terminate with an end-of-input character value.
   1749      * @param s  A unicode set
   1750      * @param c  A code point value
   1751      * @return   true if the set contains c.
   1752      */
   1753     static boolean setContains(UnicodeSet s, int c) {
   1754         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
   1755             return false;
   1756         }
   1757         return s.contains(c);
   1758     }
   1759 
   1760 
   1761     /**
   1762      * return the index of the next code point in the input text.
   1763      * @param i the preceding index
   1764      */
   1765     static int  nextCP(StringBuffer s, int i) {
   1766         if (i == -1) {
   1767             // End of Input indication.  Continue to return end value.
   1768             return -1;
   1769         }
   1770         int  retVal = i + 1;
   1771         if (retVal > s.length()) {
   1772             return -1;
   1773         }
   1774         int  c = UTF16.charAt(s, i);
   1775         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
   1776             retVal++;
   1777         }
   1778         return retVal;
   1779     }
   1780 
   1781 
   1782     /**
   1783      * random number generator.  Not using Java's built-in Randoms for two reasons:
   1784      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
   1785      *    2.  We need to get and restore the seed from values occurring in the middle
   1786      *        of a long sequence, to more easily reproduce failing cases.
   1787      */
   1788     private static int m_seed = 1;
   1789     private static int  m_rand()
   1790     {
   1791         m_seed = m_seed * 1103515245 + 12345;
   1792         return (m_seed >>> 16) % 32768;
   1793     }
   1794 
   1795     // Helper function for formatting error output.
   1796     //   Append a string into a fixed-size field in a StringBuffer.
   1797     //   Blank-pad the string if it is shorter than the field.
   1798     //   Truncate the source string if it is too long.
   1799     //
   1800     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
   1801         int appendLen = src.length();
   1802         if (appendLen >= fieldLen) {
   1803             dest.append(src.substring(0, fieldLen));
   1804         } else {
   1805             dest.append(src);
   1806             while (appendLen < fieldLen) {
   1807                 dest.append(' ');
   1808                 appendLen++;
   1809             }
   1810         }
   1811     }
   1812 
   1813     // Helper function for formatting error output.
   1814     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
   1815     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
   1816         String hexChars = "0123456789abcdef";
   1817         if (c < 0x10000) {
   1818             dest.append("\\u");
   1819             for (int bn=12; bn>=0; bn-=4) {
   1820                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
   1821             }
   1822             appendToBuf(dest, " ", fieldLen-6);
   1823         } else {
   1824             dest.append("\\U");
   1825             for (int bn=28; bn>=0; bn-=4) {
   1826                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
   1827             }
   1828             appendToBuf(dest, " ", fieldLen-10);
   1829 
   1830         }
   1831     }
   1832 
   1833     /**
   1834      *  Run a RBBI monkey test.  Common routine, for all break iterator types.
   1835      *    Parameters:
   1836      *       bi      - the break iterator to use
   1837      *       mk      - MonkeyKind, abstraction for obtaining expected results
   1838      *       name    - Name of test (char, word, etc.) for use in error messages
   1839      *       seed    - Seed for starting random number generator (parameter from user)
   1840      *       numIterations
   1841      */
   1842     void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
   1843         int              TESTSTRINGLEN = 500;
   1844         StringBuffer     testText         = new StringBuffer();
   1845         int              numCharClasses;
   1846         List             chClasses;
   1847         int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
   1848         int              expectedCount    = 0;
   1849         boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
   1850         boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
   1851         boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
   1852         boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
   1853         boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
   1854         boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
   1855         int              i;
   1856         int              loopCount        = 0;
   1857         boolean          printTestData    = false;
   1858         boolean          printBreaksFromBI = false;
   1859 
   1860         m_seed = seed;
   1861 
   1862         numCharClasses = mk.charClasses().size();
   1863         chClasses      = mk.charClasses();
   1864 
   1865         // Verify that the character classes all have at least one member.
   1866         for (i=0; i<numCharClasses; i++) {
   1867             UnicodeSet s = (UnicodeSet)chClasses.get(i);
   1868             if (s == null || s.size() == 0) {
   1869                 errln("Character Class " + i + " is null or of zero size.");
   1870                 return;
   1871             }
   1872         }
   1873 
   1874         //--------------------------------------------------------------------------------------------
   1875         //
   1876         //  Debugging settings.  Comment out everything in the following block for normal operation
   1877         //
   1878         //--------------------------------------------------------------------------------------------
   1879         // numIterations = -1;
   1880         // numIterations = 10000;   // Same as exhaustive.
   1881         // RuleBasedBreakIterator_New.fTrace = true;
   1882         // m_seed = 859056465;
   1883         // TESTSTRINGLEN = 50;
   1884         // printTestData = true;
   1885         // printBreaksFromBI = true;
   1886         // ((RuleBasedBreakIterator_New)bi).dump();
   1887 
   1888         //--------------------------------------------------------------------------------------------
   1889         //
   1890         //  End of Debugging settings.
   1891         //
   1892         //--------------------------------------------------------------------------------------------
   1893 
   1894         int  dotsOnLine = 0;
   1895         while (loopCount < numIterations || numIterations == -1) {
   1896             if (numIterations == -1 && loopCount % 10 == 0) {
   1897                 // If test is running in an infinite loop, display a periodic tic so
   1898                 //   we can tell that it is making progress.
   1899                 System.out.print(".");
   1900                 if (dotsOnLine++ >= 80){
   1901                     System.out.println();
   1902                     dotsOnLine = 0;
   1903                 }
   1904             }
   1905             // Save current random number seed, so that we can recreate the random numbers
   1906             //   for this loop iteration in event of an error.
   1907             seed = m_seed;
   1908 
   1909             testText.setLength(0);
   1910             // Populate a test string with data.
   1911             if (printTestData) {
   1912                 System.out.println("Test Data string ...");
   1913             }
   1914             for (i=0; i<TESTSTRINGLEN; i++) {
   1915                 int        aClassNum = m_rand() % numCharClasses;
   1916                 UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
   1917                 int        charIdx   = m_rand() % classSet.size();
   1918                 int        c         = classSet.charAt(charIdx);
   1919                 if (c < 0) {   // TODO:  deal with sets containing strings.
   1920                     errln("c < 0");
   1921                 }
   1922                 UTF16.appendCodePoint(testText, c);
   1923                 if (printTestData) {
   1924                     System.out.print(Integer.toHexString(c) + " ");
   1925                 }
   1926             }
   1927             if (printTestData) {
   1928                 System.out.println();
   1929             }
   1930 
   1931             Arrays.fill(expected, 0);
   1932             Arrays.fill(expectedBreaks, false);
   1933             Arrays.fill(forwardBreaks, false);
   1934             Arrays.fill(reverseBreaks, false);
   1935             Arrays.fill(isBoundaryBreaks, false);
   1936             Arrays.fill(followingBreaks, false);
   1937             Arrays.fill(precedingBreaks, false);
   1938 
   1939             // Calculate the expected results for this test string.
   1940             mk.setText(testText);
   1941             expectedCount = 0;
   1942             expectedBreaks[0] = true;
   1943             expected[expectedCount ++] = 0;
   1944             int breakPos = 0;
   1945             int lastBreakPos = -1;
   1946             for (;;) {
   1947                 lastBreakPos = breakPos;
   1948                 breakPos = mk.next(breakPos);
   1949                 if (breakPos == -1) {
   1950                     break;
   1951                 }
   1952                 if (breakPos > testText.length()) {
   1953                     errln("breakPos > testText.length()");
   1954                 }
   1955                 if (lastBreakPos >= breakPos) {
   1956                     errln("Next() not increasing.");
   1957                     // break;
   1958                 }
   1959                 expectedBreaks[breakPos] = true;
   1960                 expected[expectedCount ++] = breakPos;
   1961             }
   1962 
   1963             // Find the break positions using forward iteration
   1964             if (printBreaksFromBI) {
   1965                 System.out.println("Breaks from BI...");
   1966             }
   1967             bi.setText(testText.toString());
   1968             for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
   1969                 if (i < 0 || i > testText.length()) {
   1970                     errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
   1971                     break;
   1972                 }
   1973                 if (printBreaksFromBI) {
   1974                     System.out.print(Integer.toHexString(i) + " ");
   1975                 }
   1976                 forwardBreaks[i] = true;
   1977             }
   1978             if (printBreaksFromBI) {
   1979                 System.out.println();
   1980             }
   1981 
   1982             // Find the break positions using reverse iteration
   1983             for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
   1984                 if (i < 0 || i > testText.length()) {
   1985                     errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
   1986                     break;
   1987                 }
   1988                 reverseBreaks[i] = true;
   1989             }
   1990 
   1991             // Find the break positions using isBoundary() tests.
   1992             for (i=0; i<=testText.length(); i++) {
   1993                 isBoundaryBreaks[i] = bi.isBoundary(i);
   1994             }
   1995 
   1996             // Find the break positions using the following() function.
   1997             lastBreakPos = 0;
   1998             followingBreaks[0] = true;
   1999             for (i=0; i<testText.length(); i++) {
   2000                 breakPos = bi.following(i);
   2001                 if (breakPos <= i ||
   2002                         breakPos < lastBreakPos ||
   2003                         breakPos > testText.length() ||
   2004                         breakPos > lastBreakPos && lastBreakPos > i ) {
   2005                     errln(name + " break monkey test: " +
   2006                             "Out of range value returned by BreakIterator::following().\n" +
   2007                             "index=" + i + "following returned=" + breakPos +
   2008                             "lastBreak=" + lastBreakPos);
   2009                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
   2010                 } else {
   2011                     followingBreaks[breakPos] = true;
   2012                     lastBreakPos = breakPos;
   2013                 }
   2014             }
   2015 
   2016             // Find the break positions using the preceding() function.
   2017             lastBreakPos = testText.length();
   2018             precedingBreaks[testText.length()] = true;
   2019             for (i=testText.length(); i>0; i--) {
   2020                 breakPos = bi.preceding(i);
   2021                 if (breakPos >= i ||
   2022                         breakPos > lastBreakPos ||
   2023                         breakPos < 0 ||
   2024                         breakPos < lastBreakPos && lastBreakPos < i ) {
   2025                     errln(name + " break monkey test: " +
   2026                             "Out of range value returned by BreakIterator::preceding().\n" +
   2027                             "index=" + i + "preceding returned=" + breakPos +
   2028                             "lastBreak=" + lastBreakPos);
   2029                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
   2030                 } else {
   2031                     precedingBreaks[breakPos] = true;
   2032                     lastBreakPos = breakPos;
   2033                 }
   2034             }
   2035 
   2036 
   2037 
   2038             // Compare the expected and actual results.
   2039             for (i=0; i<=testText.length(); i++) {
   2040                 String errorType = null;
   2041                 if  (forwardBreaks[i] != expectedBreaks[i]) {
   2042                     errorType = "next()";
   2043                 } else if (reverseBreaks[i] != forwardBreaks[i]) {
   2044                     errorType = "previous()";
   2045                 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   2046                     errorType = "isBoundary()";
   2047                 } else if (followingBreaks[i] != expectedBreaks[i]) {
   2048                     errorType = "following()";
   2049                 } else if (precedingBreaks[i] != expectedBreaks[i]) {
   2050                     errorType = "preceding()";
   2051                 }
   2052 
   2053                 if (errorType != null) {
   2054                     // Format a range of the test text that includes the failure as
   2055                     //  a data item that can be included in the rbbi test data file.
   2056 
   2057                     // Start of the range is the last point where expected and actual results
   2058                     //   both agreed that there was a break position.
   2059                     int startContext = i;
   2060                     int count = 0;
   2061                     for (;;) {
   2062                         if (startContext==0) { break; }
   2063                         startContext --;
   2064                         if (expectedBreaks[startContext]) {
   2065                             if (count == 2) break;
   2066                             count ++;
   2067                         }
   2068                     }
   2069 
   2070                     // End of range is two expected breaks past the start position.
   2071                     int endContext = i + 1;
   2072                     int ci;
   2073                     for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   2074                         for (;;) {
   2075                             if (endContext >= testText.length()) {break;}
   2076                             if (expectedBreaks[endContext-1]) {
   2077                                 if (count == 0) break;
   2078                                 count --;
   2079                             }
   2080                             endContext ++;
   2081                         }
   2082                     }
   2083 
   2084                     // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
   2085                     StringBuffer errorText = new StringBuffer();
   2086 
   2087                     int      c;    // Char from test data
   2088                     for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
   2089                         if (ci == i) {
   2090                             // This is the location of the error.
   2091                             errorText.append("<?>---------------------------------\n");
   2092                         } else if (expectedBreaks[ci]) {
   2093                             // This a non-error expected break position.
   2094                             errorText.append("------------------------------------\n");
   2095                         }
   2096                         if (ci < testText.length()) {
   2097                             c = UTF16.charAt(testText, ci);
   2098                             appendCharToBuf(errorText, c, 11);
   2099                             String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
   2100                             appendToBuf(errorText, gc, 8);
   2101                             int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
   2102                             String extraPropValue =
   2103                                     UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
   2104                             appendToBuf(errorText, extraPropValue, 20);
   2105 
   2106                             String charName = UCharacter.getExtendedName(c);
   2107                             appendToBuf(errorText, charName, 40);
   2108                             errorText.append('\n');
   2109                         }
   2110                     }
   2111                     if (ci == testText.length() && ci != -1) {
   2112                         errorText.append("<>");
   2113                     }
   2114                     errorText.append("</data>\n");
   2115 
   2116                     // Output the error
   2117                     errln(name + " break monkey test error.  " +
   2118                             (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
   2119                             "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
   2120                             errorText);
   2121                     break;
   2122                 }
   2123             }
   2124 
   2125             loopCount++;
   2126         }
   2127     }
   2128 
   2129     @Test
   2130     public void TestCharMonkey() {
   2131 
   2132         int        loopCount = 500;
   2133         int        seed      = 1;
   2134 
   2135         if (TestFmwk.getExhaustiveness() >= 9) {
   2136             loopCount = 10000;
   2137         }
   2138 
   2139         RBBICharMonkey  m = new RBBICharMonkey();
   2140         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
   2141         RunMonkey(bi, m, "char", seed, loopCount);
   2142     }
   2143 
   2144     @Test
   2145     public void TestWordMonkey() {
   2146 
   2147         int        loopCount = 500;
   2148         int        seed      = 1;
   2149 
   2150         if (TestFmwk.getExhaustiveness() >= 9) {
   2151             loopCount = 10000;
   2152         }
   2153 
   2154         logln("Word Break Monkey Test");
   2155         RBBIWordMonkey  m = new RBBIWordMonkey();
   2156         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
   2157         RunMonkey(bi, m, "word", seed, loopCount);
   2158     }
   2159 
   2160     @Test
   2161     public void TestLineMonkey() {
   2162         int        loopCount = 500;
   2163         int        seed      = 1;
   2164 
   2165         if (TestFmwk.getExhaustiveness() >= 9) {
   2166             loopCount = 10000;
   2167         }
   2168 
   2169         logln("Line Break Monkey Test");
   2170         RBBILineMonkey  m = new RBBILineMonkey();
   2171         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
   2172         RunMonkey(bi, m, "line", seed, loopCount);
   2173     }
   2174 
   2175     @Test
   2176     public void TestSentMonkey() {
   2177 
   2178         int        loopCount = 500;
   2179         int        seed      = 1;
   2180 
   2181         if (TestFmwk.getExhaustiveness() >= 9) {
   2182             loopCount = 3000;
   2183         }
   2184 
   2185         logln("Sentence Break Monkey Test");
   2186         RBBISentenceMonkey  m = new RBBISentenceMonkey();
   2187         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
   2188         RunMonkey(bi, m, "sent", seed, loopCount);
   2189     }
   2190     //
   2191     //  Round-trip monkey tests.
   2192     //  Verify that break iterators created from the rule source from the default
   2193     //    break iterators still pass the monkey test for the iterator type.
   2194     //
   2195     //  This is a major test for the Rule Compiler.  The default break iterators are built
   2196     //  from pre-compiled binary rule data that was created using ICU4C; these
   2197     //  round-trip rule recompile tests verify that the Java rule compiler can
   2198     //  rebuild break iterators from the original source rules.
   2199     //
   2200     @Test
   2201     public void TestRTCharMonkey() {
   2202 
   2203         int        loopCount = 200;
   2204         int        seed      = 1;
   2205 
   2206         if (TestFmwk.getExhaustiveness() >= 9) {
   2207             loopCount = 2000;
   2208         }
   2209 
   2210         RBBICharMonkey  m = new RBBICharMonkey();
   2211         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
   2212         String rules = bi.toString();
   2213         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2214         RunMonkey(rtbi, m, "char", seed, loopCount);
   2215     }
   2216 
   2217     @Test
   2218     public void TestRTWordMonkey() {
   2219 
   2220         int        loopCount = 200;
   2221         int        seed      = 1;
   2222 
   2223         if (TestFmwk.getExhaustiveness() >= 9) {
   2224             loopCount = 2000;
   2225         }
   2226         logln("Word Break Monkey Test");
   2227         RBBIWordMonkey  m = new RBBIWordMonkey();
   2228         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
   2229         String rules = bi.toString();
   2230         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2231         RunMonkey(rtbi, m, "word", seed, loopCount);
   2232     }
   2233 
   2234     @Test
   2235     public void TestRTLineMonkey() {
   2236         int        loopCount = 200;
   2237         int        seed      = 1;
   2238 
   2239         if (TestFmwk.getExhaustiveness() >= 9) {
   2240             loopCount = 2000;
   2241         }
   2242 
   2243         logln("Line Break Monkey Test");
   2244         RBBILineMonkey  m = new RBBILineMonkey();
   2245         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
   2246         String rules = bi.toString();
   2247         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2248         RunMonkey(rtbi, m, "line", seed, loopCount);
   2249     }
   2250 
   2251     @Test
   2252     public void TestRTSentMonkey() {
   2253 
   2254         int        loopCount = 200;
   2255         int        seed      = 1;
   2256 
   2257         if (TestFmwk.getExhaustiveness() >= 9) {
   2258             loopCount = 1000;
   2259         }
   2260 
   2261         logln("Sentence Break Monkey Test");
   2262         RBBISentenceMonkey  m = new RBBISentenceMonkey();
   2263         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
   2264         String rules = bi.toString();
   2265         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
   2266         RunMonkey(rtbi, m, "sent", seed, loopCount);
   2267     }
   2268 }
   2269 
   2270