Home | History | Annotate | Download | only in rbbi
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 
      4 package com.ibm.icu.dev.test.rbbi;
      5 
      6 import java.io.IOException;
      7 import java.io.InputStream;
      8 import java.io.InputStreamReader;
      9 import java.util.ArrayList;
     10 import java.util.Arrays;
     11 import java.util.HashMap;
     12 import java.util.List;
     13 import java.util.Map;
     14 import java.util.regex.Matcher;
     15 import java.util.regex.Pattern;
     16 import java.util.regex.PatternSyntaxException;
     17 
     18 import org.junit.Test;
     19 import org.junit.runner.RunWith;
     20 import org.junit.runners.JUnit4;
     21 
     22 import com.ibm.icu.dev.test.TestFmwk;
     23 import com.ibm.icu.impl.UCharacterName;
     24 import com.ibm.icu.impl.UCharacterNameChoice;
     25 import com.ibm.icu.text.BreakIterator;
     26 import com.ibm.icu.text.RuleBasedBreakIterator;
     27 import com.ibm.icu.text.UnicodeSet;
     28 import com.ibm.icu.util.ULocale;
     29 
     30 /**
     31  * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
     32  * This is the newer, data driven monkey test. It is completely separate from the
     33  * older class RBBITestMonkey.
     34  */
     35 
     36 @RunWith(JUnit4.class)
     37 public class RBBIMonkeyTest extends TestFmwk {
     38 
     39 
     40     //  class CharClass    Represents a single character class from the source break rules.
     41     //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
     42     //                     deletes them using hash's object deleter function.
     43 
     44     static class CharClass  {
     45         String         fName;
     46         String         fOriginalDef;    // set definition as it appeared in user supplied rules.
     47         String         fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
     48         UnicodeSet     fSet;
     49         CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
     50             fName = name;
     51             fOriginalDef = originalDef;
     52             fExpandedDef = expandedDef;
     53             fSet = set;
     54         };
     55     }
     56 
     57 
     58     // class BreakRule    Struct-like class represents a single rule from a set of break rules.
     59     //                    Each rule has the set definitions expanded, and
     60     //                    is compiled to a regular expression.
     61 
     62     static class BreakRule {
     63         String    fName;                   // Name of the rule.
     64         String    fRule;                   // Rule expression, excluding the name, as written in user source.
     65         String    fExpandedRule;           // Rule expression after expanding the set definitions.
     66         Matcher   fRuleMatcher;            // Regular expression that matches the rule.
     67     };
     68 
     69 
     70     // class BreakRules    represents a complete set of break rules, possibly tailored,
     71     //                     compiled from testdata break rules.
     72 
     73     static class BreakRules {
     74         BreakRules(RBBIMonkeyImpl monkeyImpl) {
     75             fMonkeyImpl = monkeyImpl;
     76             fBreakRules = new ArrayList<BreakRule>();
     77             fType = BreakIterator.KIND_TITLE;
     78             fCharClasses = new HashMap<String, CharClass>();
     79             fCharClassList = new ArrayList<CharClass>();
     80             fDictionarySet = new UnicodeSet();
     81 
     82             // Match an alpha-numeric identifier in a rule. Will be a set name.
     83             // Use negative look-behind to exclude non-identifiers, mostly property names or values.
     84             fSetRefsMatcher = Pattern.compile(
     85                     "(?<!\\{[ \\t]{0,4})" +
     86                     "(?<!=[ \\t]{0,4})" +
     87                     "(?<!\\[:[ \\t]{0,4})" +
     88                     "(?<!\\\\)" +
     89                     "(?<![A-Za-z0-9_])" +
     90                     "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
     91                     matcher("");
     92 
     93             // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
     94             fCommentsMatcher = Pattern.compile("" +
     95                     "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
     96                     "[ \\t]*+"     +                //   Match white space.
     97                     "(#.*)?+"      +                //   Optional # plus whatever follows
     98                     "$").                           //   new-line at end of line.
     99                     matcher("");
    100 
    101             // Match (initial parse) of a character class definition line.
    102             fClassDefMatcher = Pattern.compile("" +
    103                     "[ \\t]*"           +                    // leading white space
    104                     "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
    105                     "[ \\t]*=[ \\t]*"   +                    //   =
    106                     "(.*?)"  +                               // The char class UnicodeSet expression
    107                     "[ \\t]*;$").                            // ; <end of line>
    108                     matcher("");
    109 
    110             // Match (initial parse) of a break rule line.
    111             fRuleDefMatcher = Pattern.compile("" +
    112                     "[ \\t]*"           +                     // leading white space
    113                     "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
    114                     "[ \\t]*:[ \\t]*"   +                     //   :
    115                     "(.*?)"   +                               // The rule definition
    116                     "[ \\t]*;$").                             // ; <end of line>
    117                     matcher("");
    118 
    119             // Match a property expression, either [:xxx:] or \p{...}
    120             fPropertyMatcher = Pattern.compile("" +
    121                     "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
    122                     matcher("");
    123 
    124 
    125         }
    126 
    127         /**
    128          * Create the expanded definition for this char class,
    129          * replacing any set references with the corresponding definition.
    130          */
    131         CharClass  addCharClass(String name, String definition) {
    132             StringBuffer expandedDef = new StringBuffer();
    133             fSetRefsMatcher.reset(definition);
    134             while (fSetRefsMatcher.find()) {
    135                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
    136                 CharClass snameClass = fCharClasses.get(sname);
    137                 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname;
    138 
    139                 fSetRefsMatcher.appendReplacement(expandedDef, "");
    140                 expandedDef.append(expansionForName);
    141             }
    142             fSetRefsMatcher.appendTail(expandedDef);
    143             String expandedDefString = expandedDef.toString();
    144 
    145             if (fMonkeyImpl.fDumpExpansions) {
    146                 System.out.printf("addCharClass(\"%s\"\n", name);
    147                 System.out.printf("             %s\n", definition);
    148                 System.out.printf("expandedDef: %s\n", expandedDefString);
    149             }
    150 
    151             // Verify that the expanded set definition is valid.
    152 
    153             UnicodeSet s;
    154             try {
    155                 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE);
    156             } catch (java.lang.IllegalArgumentException e) {
    157                 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
    158                 throw e;
    159             }
    160 
    161             // Get an expanded equivalent pattern from the UnicodeSet.
    162             // This removes set difference operators, which would fail if passed through to Java regex.
    163 
    164             StringBuffer expandedPattern = new StringBuffer();
    165             s._generatePattern(expandedPattern, true);
    166             expandedDefString = expandedPattern.toString();
    167             if (fMonkeyImpl.fDumpExpansions) {
    168                 System.out.printf("expandedDef2: %s\n", expandedDefString);
    169             }
    170 
    171             CharClass cclass = new CharClass(name, definition, expandedDefString, s);
    172             CharClass previousClass = fCharClasses.put(name, cclass);
    173 
    174             if (previousClass != null) {
    175                 // TODO: decide whether or not to allow redefinitions.
    176                 //       Can be convenient in some cases.
    177                 // String msg = String.format("%s: Redefinition of character class %s\n",
    178                 //         fMonkeyImpl.fRuleFileName, cclass.fName);
    179                 // System.err.println(msg);
    180                 // throw new IllegalArgumentException(msg);
    181             }
    182             return cclass;
    183 
    184         };
    185 
    186 
    187         void addRule(String  name, String  definition) {
    188             BreakRule  thisRule = new BreakRule();
    189             StringBuffer expandedDefsRule = new StringBuffer();
    190             thisRule.fName = name;
    191             thisRule.fRule = definition;
    192 
    193             // Expand the char class definitions within the rule.
    194             fSetRefsMatcher.reset(definition);
    195             while (fSetRefsMatcher.find()) {
    196                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
    197                 CharClass nameClass = fCharClasses.get(sname);
    198                 if (nameClass == null) {
    199                     System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
    200                 }
    201                 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
    202                 fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
    203                 expandedDefsRule.append(expansionForName);
    204             }
    205             fSetRefsMatcher.appendTail(expandedDefsRule);
    206 
    207             // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
    208             // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
    209             // does not recognize all properties, and because Java's definitions are likely
    210             // older than ICU's.
    211 
    212             StringBuffer expandedRule = new StringBuffer();
    213             fPropertyMatcher.reset(expandedDefsRule);
    214             while (fPropertyMatcher.find()) {
    215                 String prop = fPropertyMatcher.group();
    216                 UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
    217                 StringBuffer propExpansion = new StringBuffer();
    218                 propSet._generatePattern(propExpansion, true);
    219                 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString());
    220             }
    221             fPropertyMatcher.appendTail(expandedRule);
    222 
    223             //   Replace any [^negated sets] with equivalent flattened sets generated by
    224             //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
    225             //   to any nested classes. Variable substitution in rules produces
    226             //   nested sets that [^negation] needs to apply to.
    227 
    228             StringBuffer ruleWithFlattenedSets = new StringBuffer();
    229             int idx = 0;
    230             while (idx<expandedRule.length()) {
    231                 int setOpenPos = expandedRule.indexOf("[^", idx);
    232                 if (setOpenPos < 0) {
    233                     break;
    234                 }
    235                 if (setOpenPos > idx) {
    236                     // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
    237                     ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
    238                 }
    239                 int nestingLevel = 1;
    240                 boolean haveNesting = false;
    241                 int setClosePos;
    242                 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
    243                     char c = expandedRule.charAt(setClosePos);
    244                     if (c == '\\') {
    245                         ++setClosePos;
    246                     } else if (c == '[') {
    247                         ++nestingLevel;
    248                         haveNesting = true;
    249                     } else if (c == ']') {
    250                         --nestingLevel;
    251                     }
    252                 }
    253                 if (haveNesting && nestingLevel == 0) {
    254                     // Found one, a negated set that includes interior nested sets.
    255                     // Create an ICU UnicodeSet from the source pattern, and obtain an
    256                     // equivalent flattened pattern from that.
    257                     UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
    258                     uset._generatePattern(ruleWithFlattenedSets, true);
    259                 } else {
    260                     // The [^ set definition did not include any nested sets.
    261                     // Copy the original definition without change.
    262                     // Java regular expressions will handle it without needing to recast it.
    263                     if (nestingLevel > 0) {
    264                         // Error case of an unclosed character class expression.
    265                         // Java regex will also eventually flag the error.
    266                         System.err.printf("No closing ] found in rule %s\n", name);
    267                     }
    268                     ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
    269                 }
    270                 idx = setClosePos;
    271             }
    272 
    273             if (idx < expandedRule.length()) {
    274                 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
    275             }
    276 
    277             thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
    278 
    279             // Replace the divide sign (\u00f7) with a regular expression named capture.
    280             // When running the rules, a match that includes this group means we found a break position.
    281 
    282             // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "(?<BreakPosition>)");
    283             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "()");
    284             if (thisRule.fExpandedRule.indexOf("") != -1) {
    285                 String msg = String.format("%s Rule %s contains multiple  signs", fMonkeyImpl.fRuleFileName, name);
    286                 System.err.println(msg);
    287                 throw new IllegalArgumentException(msg);
    288             }
    289 
    290             // UAX break rule set definitions can be empty, just [].
    291             // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
    292             // also matches nothing.
    293 
    294             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
    295 
    296             // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer)
    297             //    \udddd     => \x{dddd}
    298             //    \U00hhhhhh => \x{hhhhhh}
    299 
    300             // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
    301             // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");
    302 
    303             // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character
    304             // within a regular expression character class. Put them in as unescaped literal chars.
    305             StringBuilder sb = new StringBuilder(thisRule.fExpandedRule);
    306             while (true) {
    307                 int where = sb.indexOf("\\U00");
    308                 if (where < 0) {
    309                     break;
    310                 }
    311                 String cp = hexToCodePoint(sb.substring(where+2, where+10));
    312                 sb.replace(where, where+10, cp);
    313             }
    314             thisRule.fExpandedRule = sb.toString();
    315 
    316             // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
    317             // UnicodeSet._generatePattern() inserts un-escaped "#"s
    318 
    319             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
    320             if (fMonkeyImpl.fDumpExpansions) {
    321                 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
    322             }
    323 
    324             // Compile a regular expression for this rule.
    325 
    326             try {
    327                 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
    328             } catch (PatternSyntaxException e) {
    329                 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
    330                         fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
    331                 throw e;
    332             }
    333 
    334             // Put this new rule into the vector of all Rules.
    335 
    336             fBreakRules.add(thisRule);
    337         };
    338 
    339         private static String hexToCodePoint(String hex) {
    340             int cp = Integer.parseInt(hex, 16);
    341             return new StringBuilder().appendCodePoint(cp).toString();
    342         }
    343 
    344 
    345         boolean setKeywordParameter(String keyword, String value) {
    346             if (keyword.equals("locale")) {
    347                 fLocale = new ULocale(value);
    348                 return true;
    349             }
    350             if (keyword.equals("type")) {
    351                 if (value.equals("grapheme")) {
    352                     fType = BreakIterator.KIND_CHARACTER;
    353                 } else if (value.equals("word")) {
    354                     fType = BreakIterator.KIND_WORD;
    355                 } else if (value.equals("line")) {
    356                     fType = BreakIterator.KIND_LINE;
    357                 } else if (value.equals("sentence")) {
    358                     fType = BreakIterator.KIND_SENTENCE;
    359                 } else {
    360                     String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
    361                     System.err.println(msg);
    362                     throw new IllegalArgumentException(msg);
    363                 }
    364                 return true;
    365             }
    366             return false;
    367         }
    368 
    369 
    370         RuleBasedBreakIterator createICUBreakIterator() {
    371             BreakIterator bi;
    372             switch(fType) {
    373                 case BreakIterator.KIND_CHARACTER:
    374                     bi = (BreakIterator.getCharacterInstance(fLocale));
    375                     break;
    376                 case BreakIterator.KIND_WORD:
    377                     bi = (BreakIterator.getWordInstance(fLocale));
    378                     break;
    379                 case BreakIterator.KIND_LINE:
    380                     bi = (BreakIterator.getLineInstance(fLocale));
    381                     break;
    382                 case BreakIterator.KIND_SENTENCE:
    383                     bi = (BreakIterator.getSentenceInstance(fLocale));
    384                     break;
    385                 default:
    386                     String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
    387                     System.err.println(msg);
    388                     throw new IllegalArgumentException(msg);
    389             }
    390             return (RuleBasedBreakIterator)bi;
    391 
    392         };
    393 
    394 
    395 
    396         void compileRules(String rules) {
    397             int lineNumber = 0;
    398             for (String line: rules.split("\\r?\\n")) {
    399                 ++lineNumber;
    400                 // Strip comment lines.
    401                 fCommentsMatcher.reset(line);
    402                 line = fCommentsMatcher.replaceFirst("");
    403                 if (line.isEmpty()) {
    404                     continue;
    405                 }
    406 
    407                 // Recognize character class definition and keyword lines
    408                 fClassDefMatcher.reset(line);
    409                 if (fClassDefMatcher.matches()) {
    410                     String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
    411                     String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
    412                     if (fMonkeyImpl.fDumpExpansions) {
    413                         System.out.printf("scanned class: %s = %s\n", className, classDef);
    414                     }
    415                     if (setKeywordParameter(className, classDef)) {
    416                         // The scanned item was "type = ..." or "locale = ...", etc.
    417                         //   which are not actual character classes.
    418                         continue;
    419                     }
    420                     addCharClass(className, classDef);
    421                     continue;
    422                 }
    423 
    424                 // Recognize rule lines.
    425                 fRuleDefMatcher.reset(line);
    426                 if (fRuleDefMatcher.matches()) {
    427                     String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
    428                     String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
    429                     if (fMonkeyImpl.fDumpExpansions) {
    430                         System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
    431                     }
    432                     addRule(ruleName, ruleDef);
    433                     continue;
    434                 }
    435 
    436                 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
    437                         fMonkeyImpl.fRuleFileName, lineNumber, line);
    438                 System.err.println(msg);
    439                 throw new IllegalArgumentException(msg);
    440             }
    441 
    442             // Build the vector of char classes, omitting the dictionary class if there is one.
    443             // This will be used when constructing the random text to be tested.
    444 
    445             // Also compute the "other" set, consisting of any characters not included in
    446             // one or more of the user defined sets.
    447 
    448             UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
    449 
    450             for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
    451                 String ccName = el.getKey();
    452                 CharClass cclass = el.getValue();
    453 
    454                 // System.out.printf("    Adding %s\n", ccName);
    455                 if (!ccName.equals(cclass.fName)) {
    456                     throw new IllegalArgumentException(
    457                             String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
    458                                     fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
    459                 }
    460                 otherSet.removeAll(cclass.fSet);
    461                 if (ccName.equals("dictionary")) {
    462                     fDictionarySet = cclass.fSet;
    463                 } else {
    464                     fCharClassList.add(cclass);
    465                 }
    466             }
    467 
    468             if (!otherSet.isEmpty()) {
    469                 // System.out.printf("have an other set.\n");
    470                 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
    471                 fCharClassList.add(cclass);
    472             }
    473 
    474         };
    475 
    476         CharClass getClassForChar(int c) {
    477             for (CharClass cc: fCharClassList) {
    478                 if (cc.fSet.contains(c)) {
    479                     return cc;
    480                 }
    481             }
    482             return null;
    483         };
    484 
    485 
    486         RBBIMonkeyImpl          fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
    487         List<BreakRule>         fBreakRules;        // Contents are of type (BreakRule *).
    488 
    489         Map<String, CharClass>  fCharClasses;       // Key is the set name.
    490         //                                          // Value is the corresponding CharClass
    491         List<CharClass>         fCharClassList;     // Char Classes, same contents as fCharClasses values,
    492 
    493         UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
    494         ULocale                 fLocale;
    495         int                     fType;              // BreakItererator.KIND_WORD, etc.
    496 
    497 
    498         Matcher fSetRefsMatcher;
    499         Matcher fCommentsMatcher;
    500         Matcher fClassDefMatcher;
    501         Matcher fRuleDefMatcher;
    502         Matcher fPropertyMatcher;
    503     };
    504 
    505 
    506 
    507 
    508     // class MonkeyTestData    represents a randomly synthesized test data string together
    509     //                         with the expected break positions obtained by applying
    510     //                         the test break rules.
    511 
    512     static class MonkeyTestData{
    513 
    514         void set(BreakRules rules, ICU_Rand rand) {
    515             int dataLength = 1000;   // length of test data to generate, in code points.
    516 
    517             // Fill the test string with random characters.
    518             // First randomly pick a char class, then randomly pick a character from that class.
    519             // Exclude any characters from the dictionary set.
    520 
    521             // System.out.println("Populating Test Data");
    522             fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
    523                                                   // allowing recreation of failing data.
    524             fBkRules = rules;
    525             StringBuilder newString = new StringBuilder();
    526             for (int n=0; n<dataLength;) {
    527                 int charClassIndex = rand.next() % rules.fCharClassList.size();
    528                 CharClass cclass = rules.fCharClassList.get(charClassIndex);
    529                 if (cclass.fSet.size() == 0) {
    530                     // Some rules or tailorings do end up with empty char classes.
    531                     continue;
    532                 }
    533                 int charIndex = rand.next() % cclass.fSet.size();
    534                 int c = cclass.fSet.charAt(charIndex);
    535                 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
    536                         newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
    537                     // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
    538                     // Don't let random unpaired surrogates combine in the test data because they might
    539                     // produce an unwanted dictionary character.
    540                     continue;
    541                 }
    542 
    543                 if (!rules.fDictionarySet.contains(c)) {
    544                     newString.appendCodePoint(c);
    545                     ++n;
    546                 }
    547             }
    548             fString = newString.toString();
    549 
    550             // Init the expectedBreaks, actualBreaks and ruleForPosition.
    551             // Expected and Actual breaks are one longer than the input string; a true value
    552             // will indicate a boundary preceding that position.
    553 
    554             fActualBreaks    = new boolean[fString.length()+1];
    555             fExpectedBreaks  = new boolean[fString.length()+1];
    556             fRuleForPosition = new int[fString.length()+1];
    557             f2ndRuleForPos   = new int[fString.length()+1];
    558 
    559             // Apply reference rules to find the expected breaks.
    560 
    561             fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
    562                                              // ICU always reports a break there.
    563                                              // The reference rules do not have a means to do so.
    564             int strIdx = 0;
    565             while (strIdx < fString.length()) {
    566                 BreakRule matchingRule = null;
    567                 boolean hasBreak = false;
    568                 int ruleNum = 0;
    569                 int matchStart = 0;
    570                 int matchEnd = 0;
    571                 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
    572                     BreakRule rule = rules.fBreakRules.get(ruleNum);
    573                     rule.fRuleMatcher.reset(fString.substring(strIdx));
    574                     if (rule.fRuleMatcher.lookingAt()) {
    575                         // A candidate rule match, check further to see if we take it or continue to check other rules.
    576                         // Matches of zero or one code point count only if they also specify a break.
    577                         matchStart = strIdx;
    578                         matchEnd = strIdx + rule.fRuleMatcher.end();
    579                         hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
    580                         if (hasBreak ||
    581                                 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
    582                             matchingRule = rule;
    583                             break;
    584                         }
    585                     }
    586                 }
    587                 if (matchingRule == null) {
    588                     // No reference rule matched. This is an error in the rules that should never happen.
    589                     String msg = String.format("%s: No reference rules matched at position %d. ",
    590                             rules.fMonkeyImpl.fRuleFileName, strIdx);
    591                     System.err.println(msg);
    592                     dump(strIdx);
    593                     throw new IllegalArgumentException(msg);
    594                 }
    595                 if (matchingRule.fRuleMatcher.group().length() == 0) {
    596                     // Zero length rule match. This is also an error in the rule expressions.
    597                     String msg = String.format("%s:%s: Zero length rule match at %d.",
    598                             rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
    599                     System.err.println(msg);
    600                     dump(strIdx);
    601                     throw new IllegalArgumentException(msg);
    602                 }
    603 
    604                 // Record which rule matched over the length of the match.
    605                 for (int i = matchStart; i < matchEnd; i++) {
    606                     if (fRuleForPosition[i] == 0) {
    607                         fRuleForPosition[i] = ruleNum;
    608                     } else {
    609                         f2ndRuleForPos[i] = ruleNum;
    610                     }
    611                 }
    612 
    613                 // Break positions appear in rules as a matching named capture of zero length at the break position,
    614                 //   the adjusted pattern contains (?<BreakPosition>)
    615                 if (hasBreak) {
    616                     int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
    617                     fExpectedBreaks[breakPos] = true;
    618                     // System.out.printf("recording break at %d\n", breakPos);
    619                     // For the next iteration, pick up applying rules immediately after the break,
    620                     // which may differ from end of the match. The matching rule may have included
    621                     // context following the boundary that needs to be looked at again.
    622                     strIdx = breakPos;
    623                 } else {
    624                     // Original rule didn't specify a break.
    625                     // Continue applying rules starting on the last code point of this match.
    626                     int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
    627                     if (updatedStrIdx == matchStart) {
    628                         // Match was only one code point, no progress if we continue.
    629                         // Shouldn't get here, case is filtered out at top of loop.
    630                         throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
    631                                 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
    632                     }
    633                     strIdx = updatedStrIdx;
    634                 }
    635             }
    636         };
    637 
    638         // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
    639         // @param m: a Java regex Matcher that has completed a matching operation.
    640         // @return m.start("BreakPosition),
    641         //         or -1 if there is no such group, or the group did not participate in the match.
    642         //
    643         // TODO: this becomes m.start("BreakPosition") with Java 8.
    644         //       In the mean time, assume that the only zero-length capturing group in
    645         //       a reference rule expression is the "BreakPosition" that corresponds to a "".
    646 
    647         static int BreakGroupStart(Matcher m) {
    648             for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
    649                 String group = m.group(groupNum);
    650                 if (group == null) {
    651                     continue;
    652                 }
    653                 if (group.equals("")) {
    654                     // assert(m.end(groupNum) == m.end("BreakPosition"));
    655                     return m.start(groupNum);
    656                 }
    657             }
    658             return -1;
    659         }
    660 
    661         void dump(int around) {
    662             System.out.print("\n"
    663                     +        "         char                        break  Rule                     Character\n"
    664                     +        "   pos   code   class                 R I   name                     name\n"
    665                     +        "---------------------------------------------------------------------------------------------\n");
    666 
    667             int start;
    668             int end;
    669 
    670             if (around == -1) {
    671                 start = 0;
    672                 end = fString.length();
    673             } else {
    674                 // Display context around a failure.
    675                 try {
    676                     start = fString.offsetByCodePoints(around, -30);
    677                 } catch (Exception e) {
    678                     start = 0;
    679                 }
    680                 try {
    681                     end = fString.offsetByCodePoints(around, +30);
    682                 } catch (Exception e) {
    683                     end = fString.length();
    684                 }
    685             }
    686 
    687             for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
    688                 int c = fString.codePointAt(charIdx);
    689                 CharClass cc = fBkRules.getClassForChar(c);
    690 
    691                 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
    692                 String secondRuleName = "";
    693                 if (f2ndRuleForPos[charIdx] > 0) {
    694                     secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
    695                 }
    696                 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
    697 
    698                 System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
    699                         charIdx, c, cc.fName,
    700                         fExpectedBreaks[charIdx] ? '*' : '.',
    701                         fActualBreaks[charIdx] ? '*' : '.',
    702                         rule.fName, secondRuleName, cName
    703                         );
    704                 }
    705 
    706         };
    707 
    708         void clearActualBreaks() {
    709             Arrays.fill(fActualBreaks, false);
    710         }
    711 
    712 
    713         int               fRandomSeed;        // The initial seed value from the random number generator.
    714         BreakRules        fBkRules;           // The break rules used to generate this data.
    715         String            fString;            // The text.
    716         boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
    717                                               //     Parallel to fString. true if break preceding.
    718         boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
    719         int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
    720                                               // Also parallel to fString.
    721         int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
    722                                               //   didn't cause a break, and a subsequent rule match starts
    723                                               //   on the last code point of the preceding match.
    724 
    725     }
    726 
    727 
    728     // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
    729     //                          test for one set of break rules.
    730     //
    731 
    732     static class RBBIMonkeyImpl extends Thread {
    733 
    734         void setup(String ruleFile) {
    735             fRuleFileName = ruleFile;
    736             openBreakRules(ruleFile);
    737             fRuleSet = new BreakRules(this);
    738             fRuleSet.compileRules(fRuleCharBuffer);
    739             fBI = fRuleSet.createICUBreakIterator();
    740             fTestData = new MonkeyTestData();
    741         };
    742 
    743         void openBreakRules(String fileName) {
    744             StringBuilder testFileBuf = new StringBuilder();
    745             InputStream is = null;
    746             String filePath = "break_rules/" + fileName;
    747             try {
    748                 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
    749                 if (is == null) {
    750                     errln("Could not open test data file " + fileName);
    751                     return;
    752                 }
    753                 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
    754                 try {
    755                     int c;
    756                     int count = 0;
    757                     for (;;) {
    758                         c = isr.read();
    759                         if (c < 0) {
    760                             break;
    761                         }
    762                         count++;
    763                         if (c == 0xFEFF && count == 1) {
    764                             // BOM in the test data file. Discard it.
    765                             continue;
    766                         }
    767                        testFileBuf.appendCodePoint(c);
    768                     }
    769                 } finally {
    770                     isr.close();
    771                 }
    772                 } catch (IOException e) {
    773                 try {
    774                     is.close();
    775                 } catch (IOException ignored) {
    776                 }
    777                 errln(e.toString());
    778             }
    779             fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
    780         }
    781 
    782         class MonkeyException extends RuntimeException  {
    783             private static final long serialVersionUID = 1L;
    784             public int fPosition;    // Position of the failure in the test data.
    785             MonkeyException(String description, int pos) {
    786                 super(description);
    787                 fPosition = pos;
    788             }
    789         }
    790 
    791         @Override
    792         public void run() {
    793             int errorCount = 0;
    794             if (fBI == null) {
    795                 fErrorMsgs.append("Unable to run test because fBI is null.\n");
    796                 return;
    797             }
    798             for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
    799                 try {
    800                     fTestData.set(fRuleSet, fRandomGenerator);
    801                     // fTestData.dump(-1);
    802                     testForwards();
    803                     testPrevious();
    804                     testFollowing();
    805                     testPreceding();
    806                     testIsBoundary();
    807                 } catch (MonkeyException e) {
    808                     String formattedMsg = String.format(
    809                             "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
    810                             e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
    811                     System.err.print(formattedMsg);
    812                     if (fVerbose) {
    813                         fTestData.dump(e.fPosition);
    814                     }
    815                     fErrorMsgs.append(formattedMsg);
    816                     if (++errorCount > 10) {
    817                         return;
    818                     }
    819                 }
    820                 if (fLoopCount < 0 && loopCount % 100 == 0) {
    821                     System.err.print(".");
    822                 }
    823             }
    824         }
    825 
    826         enum CheckDirection {
    827             FORWARD,
    828             REVERSE
    829         };
    830 
    831         void testForwards() {
    832             fTestData.clearActualBreaks();
    833             fBI.setText(fTestData.fString);
    834             int previousBreak = -2;
    835             for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
    836                 if (bk <= previousBreak) {
    837                     throw new MonkeyException("Break Iterator Stall", bk);
    838                 }
    839                 if (bk < 0 || bk > fTestData.fString.length()) {
    840                     throw new MonkeyException("Boundary out of bounds", bk);
    841                 }
    842                 fTestData.fActualBreaks[bk] = true;
    843             }
    844             checkResults("testForwards", CheckDirection.FORWARD);
    845         };
    846 
    847 
    848        void testFollowing() {
    849            fTestData.clearActualBreaks();
    850            fBI.setText(fTestData.fString);
    851            int nextBreak = -1;
    852            for (int i=-1 ; i<fTestData.fString.length(); ++i) {
    853                int bk = fBI.following(i);
    854                if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
    855                    continue;
    856                }
    857                if (bk == nextBreak && bk > i) {
    858                    // i is in the gap between two breaks.
    859                    continue;
    860                }
    861                if (i == nextBreak && bk > nextBreak) {
    862                    fTestData.fActualBreaks[bk] = true;
    863                    nextBreak = bk;
    864                    continue;
    865                }
    866                throw new MonkeyException("following(i)", i);
    867            }
    868            checkResults("testFollowing", CheckDirection.FORWARD);
    869         };
    870 
    871 
    872         void testPrevious() {
    873             fTestData.clearActualBreaks();
    874             fBI.setText(fTestData.fString);
    875             int previousBreak = Integer.MAX_VALUE;
    876             for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
    877                  if (bk >= previousBreak) {
    878                      throw new MonkeyException("Break Iterator Stall", bk);
    879                 }
    880                 if (bk < 0 || bk > fTestData.fString.length()) {
    881                     throw new MonkeyException("Boundary out of bounds", bk);
    882                 }
    883                 fTestData.fActualBreaks[bk] = true;
    884             }
    885             checkResults("testPrevius", CheckDirection.REVERSE);
    886         };
    887 
    888 
    889         /**
    890          * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
    891          * adjust it to point to the lead surrogate, which is the start of the code point.
    892          * @param s the String.
    893          * @param i the initial index
    894          * @return the adjusted index
    895          */
    896         private int getChar32Start(String s, int i) {
    897             if (i > 0 && i < s.length() &&
    898                     Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
    899                 --i;
    900             }
    901             return i;
    902         }
    903 
    904 
    905         void testPreceding() {
    906             fTestData.clearActualBreaks();
    907             fBI.setText(fTestData.fString);
    908             int nextBreak = fTestData.fString.length()+1;
    909             for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
    910                 int bk = fBI.preceding(i);
    911                 // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
    912                 if (bk == BreakIterator.DONE && i == 0) {
    913                     continue;
    914                 }
    915                 if (bk == nextBreak && bk < i) {
    916                     // i is in the gap between two breaks.
    917                     continue;
    918                 }
    919                 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
    920                     // i indexes to a trailing surrogate.
    921                     // Break Iterators treat an index to either half as referring to the supplemental code point,
    922                     // with preceding going to some preceding code point.
    923                     if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
    924                         throw new MonkeyException("preceding of trailing surrogate error", i);
    925                     }
    926                     continue;
    927                 }
    928                 if (i == nextBreak && bk < nextBreak) {
    929                     fTestData.fActualBreaks[bk] = true;
    930                     nextBreak = bk;
    931                     continue;
    932                 }
    933                 throw new MonkeyException("preceding(i)", i);
    934             }
    935             checkResults("testPreceding", CheckDirection.REVERSE);
    936 
    937         };
    938 
    939 
    940         void testIsBoundary() {
    941             fTestData.clearActualBreaks();
    942             fBI.setText(fTestData.fString);
    943             for (int i=fTestData.fString.length(); i>=0; --i) {
    944                 if (fBI.isBoundary(i)) {
    945                     fTestData.fActualBreaks[i] = true;
    946                 }
    947             }
    948             checkResults("testForwards", CheckDirection.FORWARD);
    949         };
    950 
    951 
    952         void checkResults(String msg, CheckDirection direction) {
    953             if (direction == CheckDirection.FORWARD) {
    954                 for (int i=0; i<=fTestData.fString.length(); ++i) {
    955                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
    956                         throw new MonkeyException(msg, i);
    957                     }
    958                 }
    959             } else {
    960                 for (int i=fTestData.fString.length(); i>=0; i--) {
    961                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
    962                         throw new MonkeyException(msg, i);
    963                     }
    964                 }
    965             }
    966 
    967         };
    968 
    969         String                 fRuleCharBuffer;         // source file contents of the reference rules.
    970         BreakRules             fRuleSet;
    971         RuleBasedBreakIterator fBI;
    972         MonkeyTestData         fTestData;
    973         ICU_Rand               fRandomGenerator;
    974         String                 fRuleFileName;
    975         boolean                fVerbose;                 // True to do long dump of failing data.
    976         int                    fLoopCount;
    977         int                    fErrorCount;
    978 
    979         boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
    980         StringBuilder          fErrorMsgs = new StringBuilder();
    981 
    982     }
    983 
    984     //  Test parameters, specified via Java properties.
    985     //
    986     //  rules=file_name   Name of file containing the reference rules.
    987     //  seed=nnnnn        Random number starting seed.
    988     //                    Setting the seed allows errors to be reproduced.
    989     //  loop=nnn          Looping count.  Controls running time.
    990     //                    -1:  run forever.
    991     //                     0 or greater:  run length.
    992     //  expansions        debug option, show expansions of rules and sets.
    993     //  verbose           Display details of the failure.
    994     //
    995     // Parameters are passed to the JVM on the command line, or
    996     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
    997     // For example,
    998     //      -ea -Drules=line.txt -Dloop=-1
    999     //
   1000     @Test
   1001     public void TestMonkey() {
   1002         String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
   1003                 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
   1004         };
   1005 
   1006         String testNameFromParams = getProperty("rules");
   1007 
   1008         if (testNameFromParams != null) {
   1009             tests = new String[] {testNameFromParams};
   1010         }
   1011 
   1012         int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
   1013         boolean dumpExpansions =  getBooleanProperty("expansions", false);
   1014         boolean verbose = getBooleanProperty("verbose", false);
   1015         int seed = getIntProperty("seed", 1);
   1016 
   1017         List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();
   1018 
   1019         // Monkey testing is multi-threaded.
   1020         // Each set of break rules to be tested is run in a separate thread.
   1021         // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
   1022 
   1023         for (String testName: tests) {
   1024             logln(String.format("beginning testing of %s", testName));
   1025 
   1026             RBBIMonkeyImpl test = new RBBIMonkeyImpl();
   1027 
   1028             test.fDumpExpansions = dumpExpansions;
   1029             test.fVerbose = verbose;
   1030             test.fRandomGenerator = new ICU_Rand(seed);
   1031             test.fLoopCount = loopCount;
   1032             test.setup(testName);
   1033 
   1034             test.start();
   1035             startedTests.add(test);
   1036         }
   1037 
   1038         StringBuilder errors = new StringBuilder();
   1039         for (RBBIMonkeyImpl test: startedTests) {
   1040             try {
   1041                 test.join();
   1042                 errors.append(test.fErrorMsgs);
   1043             } catch (InterruptedException e) {
   1044                 errors.append(e + "\n");
   1045             }
   1046         }
   1047         String errorMsgs = errors.toString();
   1048         assertEquals(errorMsgs, "", errorMsgs);
   1049 
   1050     }
   1051 
   1052 
   1053 }
   1054