Home | History | Annotate | Download | only in rbbi
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 
      5 package android.icu.dev.test.rbbi;
      6 
      7 import java.io.IOException;
      8 import java.io.InputStream;
      9 import java.io.InputStreamReader;
     10 import java.util.ArrayList;
     11 import java.util.Arrays;
     12 import java.util.HashMap;
     13 import java.util.List;
     14 import java.util.Map;
     15 import java.util.regex.Matcher;
     16 import java.util.regex.Pattern;
     17 import java.util.regex.PatternSyntaxException;
     18 
     19 import org.junit.Test;
     20 import org.junit.runner.RunWith;
     21 import org.junit.runners.JUnit4;
     22 
     23 import android.icu.dev.test.TestFmwk;
     24 import android.icu.impl.UCharacterName;
     25 import android.icu.impl.UCharacterNameChoice;
     26 import android.icu.text.BreakIterator;
     27 import android.icu.text.RuleBasedBreakIterator;
     28 import android.icu.text.UnicodeSet;
     29 import android.icu.util.ULocale;
     30 import android.icu.testsharding.MainTestShard;
     31 
     32 /**
     33  * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
     34  * This is the newer, data driven monkey test. It is completely separate from the
     35  * older class RBBITestMonkey.
     36  */
     37 
     38 @MainTestShard
     39 @RunWith(JUnit4.class)
     40 public class RBBIMonkeyTest extends TestFmwk {
     41 
     42 
     43     //  class CharClass    Represents a single character class from the source break rules.
     44     //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
     45     //                     deletes them using hash's object deleter function.
     46 
     47     static class CharClass  {
     48         String         fName;
     49         String         fOriginalDef;    // set definition as it appeared in user supplied rules.
     50         String         fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
     51         UnicodeSet     fSet;
     52         CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
     53             fName = name;
     54             fOriginalDef = originalDef;
     55             fExpandedDef = expandedDef;
     56             fSet = set;
     57         };
     58     }
     59 
     60 
     61     // class BreakRule    Struct-like class represents a single rule from a set of break rules.
     62     //                    Each rule has the set definitions expanded, and
     63     //                    is compiled to a regular expression.
     64 
     65     static class BreakRule {
     66         String    fName;                   // Name of the rule.
     67         String    fRule;                   // Rule expression, excluding the name, as written in user source.
     68         String    fExpandedRule;           // Rule expression after expanding the set definitions.
     69         Matcher   fRuleMatcher;            // Regular expression that matches the rule.
     70     };
     71 
     72 
     73     // class BreakRules    represents a complete set of break rules, possibly tailored,
     74     //                     compiled from testdata break rules.
     75 
     76     static class BreakRules {
     77         BreakRules(RBBIMonkeyImpl monkeyImpl) {
     78             fMonkeyImpl = monkeyImpl;
     79             fBreakRules = new ArrayList<BreakRule>();
     80             fType = BreakIterator.KIND_TITLE;
     81             fCharClasses = new HashMap<String, CharClass>();
     82             fCharClassList = new ArrayList<CharClass>();
     83             fDictionarySet = new UnicodeSet();
     84 
     85             // Match an alpha-numeric identifier in a rule. Will be a set name.
     86             // Use negative look-behind to exclude non-identifiers, mostly property names or values.
     87             fSetRefsMatcher = Pattern.compile(
     88                     "(?<!\\{[ \\t]{0,4})" +
     89                     "(?<!=[ \\t]{0,4})" +
     90                     "(?<!\\[:[ \\t]{0,4})" +
     91                     "(?<!\\\\)" +
     92                     "(?<![A-Za-z0-9_])" +
     93                     "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
     94                     matcher("");
     95 
     96             // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
     97             fCommentsMatcher = Pattern.compile("" +
     98                     "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
     99                     "[ \\t]*+"     +                //   Match white space.
    100                     "(#.*)?+"      +                //   Optional # plus whatever follows
    101                     "$").                           //   new-line at end of line.
    102                     matcher("");
    103 
    104             // Match (initial parse) of a character class definition line.
    105             fClassDefMatcher = Pattern.compile("" +
    106                     "[ \\t]*"           +                    // leading white space
    107                     "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
    108                     "[ \\t]*=[ \\t]*"   +                    //   =
    109                     "(.*?)"  +                               // The char class UnicodeSet expression
    110                     "[ \\t]*;$").                            // ; <end of line>
    111                     matcher("");
    112 
    113             // Match (initial parse) of a break rule line.
    114             fRuleDefMatcher = Pattern.compile("" +
    115                     "[ \\t]*"           +                     // leading white space
    116                     "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
    117                     "[ \\t]*:[ \\t]*"   +                     //   :
    118                     "(.*?)"   +                               // The rule definition
    119                     "[ \\t]*;$").                             // ; <end of line>
    120                     matcher("");
    121 
    122             // Match a property expression, either [:xxx:] or \p{...}
    123             fPropertyMatcher = Pattern.compile("" +
    124                     "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
    125                     matcher("");
    126 
    127 
    128         }
    129 
    130         /**
    131          * Create the expanded definition for this char class,
    132          * replacing any set references with the corresponding definition.
    133          */
    134         CharClass  addCharClass(String name, String definition) {
    135             StringBuffer expandedDef = new StringBuffer();
    136             fSetRefsMatcher.reset(definition);
    137             while (fSetRefsMatcher.find()) {
    138                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
    139                 CharClass snameClass = fCharClasses.get(sname);
    140                 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname;
    141 
    142                 fSetRefsMatcher.appendReplacement(expandedDef, "");
    143                 expandedDef.append(expansionForName);
    144             }
    145             fSetRefsMatcher.appendTail(expandedDef);
    146             String expandedDefString = expandedDef.toString();
    147 
    148             if (fMonkeyImpl.fDumpExpansions) {
    149                 System.out.printf("addCharClass(\"%s\"\n", name);
    150                 System.out.printf("             %s\n", definition);
    151                 System.out.printf("expandedDef: %s\n", expandedDefString);
    152             }
    153 
    154             // Verify that the expanded set definition is valid.
    155 
    156             UnicodeSet s;
    157             try {
    158                 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE);
    159             } catch (java.lang.IllegalArgumentException e) {
    160                 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
    161                 throw e;
    162             }
    163 
    164             // Get an expanded equivalent pattern from the UnicodeSet.
    165             // This removes set difference operators, which would fail if passed through to Java regex.
    166 
    167             StringBuffer expandedPattern = new StringBuffer();
    168             s._generatePattern(expandedPattern, true);
    169             expandedDefString = expandedPattern.toString();
    170             if (fMonkeyImpl.fDumpExpansions) {
    171                 System.out.printf("expandedDef2: %s\n", expandedDefString);
    172             }
    173 
    174             CharClass cclass = new CharClass(name, definition, expandedDefString, s);
    175             CharClass previousClass = fCharClasses.put(name, cclass);
    176 
    177             if (previousClass != null) {
    178                 // TODO: decide whether or not to allow redefinitions.
    179                 //       Can be convenient in some cases.
    180                 // String msg = String.format("%s: Redefinition of character class %s\n",
    181                 //         fMonkeyImpl.fRuleFileName, cclass.fName);
    182                 // System.err.println(msg);
    183                 // throw new IllegalArgumentException(msg);
    184             }
    185             return cclass;
    186 
    187         };
    188 
    189 
    190         void addRule(String  name, String  definition) {
    191             BreakRule  thisRule = new BreakRule();
    192             StringBuffer expandedDefsRule = new StringBuffer();
    193             thisRule.fName = name;
    194             thisRule.fRule = definition;
    195 
    196             // Expand the char class definitions within the rule.
    197             fSetRefsMatcher.reset(definition);
    198             while (fSetRefsMatcher.find()) {
    199                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
    200                 CharClass nameClass = fCharClasses.get(sname);
    201                 if (nameClass == null) {
    202                     System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
    203                 }
    204                 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
    205                 fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
    206                 expandedDefsRule.append(expansionForName);
    207             }
    208             fSetRefsMatcher.appendTail(expandedDefsRule);
    209 
    210             // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
    211             // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
    212             // does not recognize all properties, and because Java's definitions are likely
    213             // older than ICU's.
    214 
    215             StringBuffer expandedRule = new StringBuffer();
    216             fPropertyMatcher.reset(expandedDefsRule);
    217             while (fPropertyMatcher.find()) {
    218                 String prop = fPropertyMatcher.group();
    219                 UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
    220                 StringBuffer propExpansion = new StringBuffer();
    221                 propSet._generatePattern(propExpansion, true);
    222                 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString());
    223             }
    224             fPropertyMatcher.appendTail(expandedRule);
    225 
    226             //   Replace any [^negated sets] with equivalent flattened sets generated by
    227             //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
    228             //   to any nested classes. Variable substitution in rules produces
    229             //   nested sets that [^negation] needs to apply to.
    230 
    231             StringBuffer ruleWithFlattenedSets = new StringBuffer();
    232             int idx = 0;
    233             while (idx<expandedRule.length()) {
    234                 int setOpenPos = expandedRule.indexOf("[^", idx);
    235                 if (setOpenPos < 0) {
    236                     break;
    237                 }
    238                 if (setOpenPos > idx) {
    239                     // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
    240                     ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
    241                 }
    242                 int nestingLevel = 1;
    243                 boolean haveNesting = false;
    244                 int setClosePos;
    245                 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
    246                     char c = expandedRule.charAt(setClosePos);
    247                     if (c == '\\') {
    248                         ++setClosePos;
    249                     } else if (c == '[') {
    250                         ++nestingLevel;
    251                         haveNesting = true;
    252                     } else if (c == ']') {
    253                         --nestingLevel;
    254                     }
    255                 }
    256                 if (haveNesting && nestingLevel == 0) {
    257                     // Found one, a negated set that includes interior nested sets.
    258                     // Create an ICU UnicodeSet from the source pattern, and obtain an
    259                     // equivalent flattened pattern from that.
    260                     UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
    261                     uset._generatePattern(ruleWithFlattenedSets, true);
    262                 } else {
    263                     // The [^ set definition did not include any nested sets.
    264                     // Copy the original definition without change.
    265                     // Java regular expressions will handle it without needing to recast it.
    266                     if (nestingLevel > 0) {
    267                         // Error case of an unclosed character class expression.
    268                         // Java regex will also eventually flag the error.
    269                         System.err.printf("No closing ] found in rule %s\n", name);
    270                     }
    271                     ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
    272                 }
    273                 idx = setClosePos;
    274             }
    275 
    276             if (idx < expandedRule.length()) {
    277                 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
    278             }
    279 
    280             thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
    281 
    282             // Replace the divide sign (\u00f7) with a regular expression named capture.
    283             // When running the rules, a match that includes this group means we found a break position.
    284 
    285             // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "(?<BreakPosition>)");
    286             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "()");
    287             if (thisRule.fExpandedRule.indexOf("") != -1) {
    288                 String msg = String.format("%s Rule %s contains multiple  signs", fMonkeyImpl.fRuleFileName, name);
    289                 System.err.println(msg);
    290                 throw new IllegalArgumentException(msg);
    291             }
    292 
    293             // UAX break rule set definitions can be empty, just [].
    294             // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
    295             // also matches nothing.
    296 
    297             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
    298 
    299             // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer)
    300             //    \udddd     => \x{dddd}
    301             //    \U00hhhhhh => \x{hhhhhh}
    302 
    303             // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
    304             // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");
    305 
    306             // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character
    307             // within a regular expression character class. Put them in as unescaped literal chars.
    308             StringBuilder sb = new StringBuilder(thisRule.fExpandedRule);
    309             while (true) {
    310                 int where = sb.indexOf("\\U00");
    311                 if (where < 0) {
    312                     break;
    313                 }
    314                 String cp = hexToCodePoint(sb.substring(where+2, where+10));
    315                 sb.replace(where, where+10, cp);
    316             }
    317             thisRule.fExpandedRule = sb.toString();
    318 
    319             // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
    320             // UnicodeSet._generatePattern() inserts un-escaped "#"s
    321 
    322             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
    323             if (fMonkeyImpl.fDumpExpansions) {
    324                 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
    325             }
    326 
    327             // Compile a regular expression for this rule.
    328 
    329             try {
    330                 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
    331             } catch (PatternSyntaxException e) {
    332                 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
    333                         fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
    334                 throw e;
    335             }
    336 
    337             // Put this new rule into the vector of all Rules.
    338 
    339             fBreakRules.add(thisRule);
    340         };
    341 
    342         private static String hexToCodePoint(String hex) {
    343             int cp = Integer.parseInt(hex, 16);
    344             return new StringBuilder().appendCodePoint(cp).toString();
    345         }
    346 
    347 
    348         boolean setKeywordParameter(String keyword, String value) {
    349             if (keyword.equals("locale")) {
    350                 fLocale = new ULocale(value);
    351                 return true;
    352             }
    353             if (keyword.equals("type")) {
    354                 if (value.equals("grapheme")) {
    355                     fType = BreakIterator.KIND_CHARACTER;
    356                 } else if (value.equals("word")) {
    357                     fType = BreakIterator.KIND_WORD;
    358                 } else if (value.equals("line")) {
    359                     fType = BreakIterator.KIND_LINE;
    360                 } else if (value.equals("sentence")) {
    361                     fType = BreakIterator.KIND_SENTENCE;
    362                 } else {
    363                     String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
    364                     System.err.println(msg);
    365                     throw new IllegalArgumentException(msg);
    366                 }
    367                 return true;
    368             }
    369             return false;
    370         }
    371 
    372 
    373         RuleBasedBreakIterator createICUBreakIterator() {
    374             BreakIterator bi;
    375             switch(fType) {
    376                 case BreakIterator.KIND_CHARACTER:
    377                     bi = (BreakIterator.getCharacterInstance(fLocale));
    378                     break;
    379                 case BreakIterator.KIND_WORD:
    380                     bi = (BreakIterator.getWordInstance(fLocale));
    381                     break;
    382                 case BreakIterator.KIND_LINE:
    383                     bi = (BreakIterator.getLineInstance(fLocale));
    384                     break;
    385                 case BreakIterator.KIND_SENTENCE:
    386                     bi = (BreakIterator.getSentenceInstance(fLocale));
    387                     break;
    388                 default:
    389                     String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
    390                     System.err.println(msg);
    391                     throw new IllegalArgumentException(msg);
    392             }
    393             return (RuleBasedBreakIterator)bi;
    394 
    395         };
    396 
    397 
    398 
    399         void compileRules(String rules) {
    400             int lineNumber = 0;
    401             for (String line: rules.split("\\r?\\n")) {
    402                 ++lineNumber;
    403                 // Strip comment lines.
    404                 fCommentsMatcher.reset(line);
    405                 line = fCommentsMatcher.replaceFirst("");
    406                 if (line.isEmpty()) {
    407                     continue;
    408                 }
    409 
    410                 // Recognize character class definition and keyword lines
    411                 fClassDefMatcher.reset(line);
    412                 if (fClassDefMatcher.matches()) {
    413                     String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
    414                     String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
    415                     if (fMonkeyImpl.fDumpExpansions) {
    416                         System.out.printf("scanned class: %s = %s\n", className, classDef);
    417                     }
    418                     if (setKeywordParameter(className, classDef)) {
    419                         // The scanned item was "type = ..." or "locale = ...", etc.
    420                         //   which are not actual character classes.
    421                         continue;
    422                     }
    423                     addCharClass(className, classDef);
    424                     continue;
    425                 }
    426 
    427                 // Recognize rule lines.
    428                 fRuleDefMatcher.reset(line);
    429                 if (fRuleDefMatcher.matches()) {
    430                     String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
    431                     String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
    432                     if (fMonkeyImpl.fDumpExpansions) {
    433                         System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
    434                     }
    435                     addRule(ruleName, ruleDef);
    436                     continue;
    437                 }
    438 
    439                 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
    440                         fMonkeyImpl.fRuleFileName, lineNumber, line);
    441                 System.err.println(msg);
    442                 throw new IllegalArgumentException(msg);
    443             }
    444 
    445             // Build the vector of char classes, omitting the dictionary class if there is one.
    446             // This will be used when constructing the random text to be tested.
    447 
    448             // Also compute the "other" set, consisting of any characters not included in
    449             // one or more of the user defined sets.
    450 
    451             UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
    452 
    453             for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
    454                 String ccName = el.getKey();
    455                 CharClass cclass = el.getValue();
    456 
    457                 // System.out.printf("    Adding %s\n", ccName);
    458                 if (!ccName.equals(cclass.fName)) {
    459                     throw new IllegalArgumentException(
    460                             String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
    461                                     fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
    462                 }
    463                 otherSet.removeAll(cclass.fSet);
    464                 if (ccName.equals("dictionary")) {
    465                     fDictionarySet = cclass.fSet;
    466                 } else {
    467                     fCharClassList.add(cclass);
    468                 }
    469             }
    470 
    471             if (!otherSet.isEmpty()) {
    472                 // System.out.printf("have an other set.\n");
    473                 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
    474                 fCharClassList.add(cclass);
    475             }
    476 
    477         };
    478 
    479         CharClass getClassForChar(int c) {
    480             for (CharClass cc: fCharClassList) {
    481                 if (cc.fSet.contains(c)) {
    482                     return cc;
    483                 }
    484             }
    485             return null;
    486         };
    487 
    488 
        RBBIMonkeyImpl          fMonkeyImpl;        // Reference back to the owning MonkeyImpl instance.
        List<BreakRule>         fBreakRules;        // The compiled rules, in the order addRule() appended them.

        Map<String, CharClass>  fCharClasses;       // Key is the set name.
        //                                          // Value is the corresponding CharClass
        List<CharClass>         fCharClassList;     // Char classes, same contents as the fCharClasses values
        //                                          // except for the "dictionary" class; filled by compileRules().
        UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
        ULocale                 fLocale;            // Set from a "locale" keyword line; not assigned elsewhere in this class.
        int                     fType;              // BreakIterator.KIND_WORD, etc.


        // Reusable matchers, created once by the constructor and reset() onto each new input.
        Matcher fSetRefsMatcher;                    // An identifier that may be a character class name.
        Matcher fCommentsMatcher;                   // A comment and/or blank remainder of a line.
        Matcher fClassDefMatcher;                   // A "name = definition;" line.
        Matcher fRuleDefMatcher;                    // A "name : rule;" line.
        Matcher fPropertyMatcher;                   // A property expression, [:xxx:] or \p{...}
    506     };
    507 
    508 
    509 
    510 
    511     // class MonkeyTestData    represents a randomly synthesized test data string together
    512     //                         with the expected break positions obtained by applying
    513     //                         the test break rules.
    514 
    515     static class MonkeyTestData{
    516 
    517         void set(BreakRules rules, ICU_Rand rand) {
    518             int dataLength = 1000;   // length of test data to generate, in code points.
    519 
    520             // Fill the test string with random characters.
    521             // First randomly pick a char class, then randomly pick a character from that class.
    522             // Exclude any characters from the dictionary set.
    523 
    524             // System.out.println("Populating Test Data");
    525             fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
    526                                                   // allowing recreation of failing data.
    527             fBkRules = rules;
    528             StringBuilder newString = new StringBuilder();
    529             for (int n=0; n<dataLength;) {
    530                 int charClassIndex = rand.next() % rules.fCharClassList.size();
    531                 CharClass cclass = rules.fCharClassList.get(charClassIndex);
    532                 if (cclass.fSet.size() == 0) {
    533                     // Some rules or tailorings do end up with empty char classes.
    534                     continue;
    535                 }
    536                 int charIndex = rand.next() % cclass.fSet.size();
    537                 int c = cclass.fSet.charAt(charIndex);
    538                 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
    539                         newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
    540                     // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
    541                     // Don't let random unpaired surrogates combine in the test data because they might
    542                     // produce an unwanted dictionary character.
    543                     continue;
    544                 }
    545 
    546                 if (!rules.fDictionarySet.contains(c)) {
    547                     newString.appendCodePoint(c);
    548                     ++n;
    549                 }
    550             }
    551             fString = newString.toString();
    552 
    553             // Init the expectedBreaks, actualBreaks and ruleForPosition.
    554             // Expected and Actual breaks are one longer than the input string; a true value
    555             // will indicate a boundary preceding that position.
    556 
    557             fActualBreaks    = new boolean[fString.length()+1];
    558             fExpectedBreaks  = new boolean[fString.length()+1];
    559             fRuleForPosition = new int[fString.length()+1];
    560             f2ndRuleForPos   = new int[fString.length()+1];
    561 
    562             // Apply reference rules to find the expected breaks.
    563 
    564             fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
    565                                              // ICU always reports a break there.
    566                                              // The reference rules do not have a means to do so.
    567             int strIdx = 0;
    568             while (strIdx < fString.length()) {
    569                 BreakRule matchingRule = null;
    570                 boolean hasBreak = false;
    571                 int ruleNum = 0;
    572                 int matchStart = 0;
    573                 int matchEnd = 0;
    574                 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
    575                     BreakRule rule = rules.fBreakRules.get(ruleNum);
    576                     rule.fRuleMatcher.reset(fString.substring(strIdx));
    577                     if (rule.fRuleMatcher.lookingAt()) {
    578                         // A candidate rule match, check further to see if we take it or continue to check other rules.
    579                         // Matches of zero or one code point count only if they also specify a break.
    580                         matchStart = strIdx;
    581                         matchEnd = strIdx + rule.fRuleMatcher.end();
    582                         hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
    583                         if (hasBreak ||
    584                                 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
    585                             matchingRule = rule;
    586                             break;
    587                         }
    588                     }
    589                 }
    590                 if (matchingRule == null) {
    591                     // No reference rule matched. This is an error in the rules that should never happen.
    592                     String msg = String.format("%s: No reference rules matched at position %d. ",
    593                             rules.fMonkeyImpl.fRuleFileName, strIdx);
    594                     System.err.println(msg);
    595                     dump(strIdx);
    596                     throw new IllegalArgumentException(msg);
    597                 }
    598                 if (matchingRule.fRuleMatcher.group().length() == 0) {
    599                     // Zero length rule match. This is also an error in the rule expressions.
    600                     String msg = String.format("%s:%s: Zero length rule match at %d.",
    601                             rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
    602                     System.err.println(msg);
    603                     dump(strIdx);
    604                     throw new IllegalArgumentException(msg);
    605                 }
    606 
    607                 // Record which rule matched over the length of the match.
    608                 for (int i = matchStart; i < matchEnd; i++) {
    609                     if (fRuleForPosition[i] == 0) {
    610                         fRuleForPosition[i] = ruleNum;
    611                     } else {
    612                         f2ndRuleForPos[i] = ruleNum;
    613                     }
    614                 }
    615 
    616                 // Break positions appear in rules as a matching named capture of zero length at the break position,
    617                 //   the adjusted pattern contains (?<BreakPosition>)
    618                 if (hasBreak) {
    619                     int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
    620                     fExpectedBreaks[breakPos] = true;
    621                     // System.out.printf("recording break at %d\n", breakPos);
    622                     // For the next iteration, pick up applying rules immediately after the break,
    623                     // which may differ from end of the match. The matching rule may have included
    624                     // context following the boundary that needs to be looked at again.
    625                     strIdx = breakPos;
    626                 } else {
    627                     // Original rule didn't specify a break.
    628                     // Continue applying rules starting on the last code point of this match.
    629                     int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
    630                     if (updatedStrIdx == matchStart) {
    631                         // Match was only one code point, no progress if we continue.
    632                         // Shouldn't get here, case is filtered out at top of loop.
    633                         throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
    634                                 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
    635                     }
    636                     strIdx = updatedStrIdx;
    637                 }
    638             }
    639         };
    640 
    641         // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
    642         // @param m: a Java regex Matcher that has completed a matching operation.
    643         // @return m.start("BreakPosition),
    644         //         or -1 if there is no such group, or the group did not participate in the match.
    645         //
    646         // TODO: this becomes m.start("BreakPosition") with Java 8.
    647         //       In the mean time, assume that the only zero-length capturing group in
    648         //       a reference rule expression is the "BreakPosition" that corresponds to a "".
    649 
    650         static int BreakGroupStart(Matcher m) {
    651             for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
    652                 String group = m.group(groupNum);
    653                 if (group == null) {
    654                     continue;
    655                 }
    656                 if (group.equals("")) {
    657                     // assert(m.end(groupNum) == m.end("BreakPosition"));
    658                     return m.start(groupNum);
    659                 }
    660             }
    661             return -1;
    662         }
    663 
    664         void dump(int around) {
    665             System.out.print("\n"
    666                     +        "         char                        break  Rule                     Character\n"
    667                     +        "   pos   code   class                 R I   name                     name\n"
    668                     +        "---------------------------------------------------------------------------------------------\n");
    669 
    670             int start;
    671             int end;
    672 
    673             if (around == -1) {
    674                 start = 0;
    675                 end = fString.length();
    676             } else {
    677                 // Display context around a failure.
    678                 try {
    679                     start = fString.offsetByCodePoints(around, -30);
    680                 } catch (Exception e) {
    681                     start = 0;
    682                 }
    683                 try {
    684                     end = fString.offsetByCodePoints(around, +30);
    685                 } catch (Exception e) {
    686                     end = fString.length();
    687                 }
    688             }
    689 
    690             for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
    691                 int c = fString.codePointAt(charIdx);
    692                 CharClass cc = fBkRules.getClassForChar(c);
    693 
    694                 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
    695                 String secondRuleName = "";
    696                 if (f2ndRuleForPos[charIdx] > 0) {
    697                     secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
    698                 }
    699                 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
    700 
    701                 System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
    702                         charIdx, c, cc.fName,
    703                         fExpectedBreaks[charIdx] ? '*' : '.',
    704                         fActualBreaks[charIdx] ? '*' : '.',
    705                         rule.fName, secondRuleName, cName
    706                         );
    707                 }
    708 
    709         };
    710 
    711         void clearActualBreaks() {
    712             Arrays.fill(fActualBreaks, false);
    713         }
    714 
    715 
    716         int               fRandomSeed;        // The initial seed value from the random number generator.
    717         BreakRules        fBkRules;           // The break rules used to generate this data.
    718         String            fString;            // The text.
    719         boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
    720                                               //     Parallel to fString. true if break preceding.
    721         boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
    722         int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
    723                                               // Also parallel to fString.
    724         int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
    725                                               //   didn't cause a break, and a subsequent rule match starts
    726                                               //   on the last code point of the preceding match.
    727 
    728     }
    729 
    730 
    731     // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
    732     //                          test for one set of break rules.
    733     //
    734 
    735     static class RBBIMonkeyImpl extends Thread {
    736 
        /**
         * Prepares this object for a run against one file of reference break rules.
         * The statements are order dependent: the rule file is read first (filling
         * fRuleCharBuffer), the rules are then compiled from that text, and the ICU
         * break iterator is obtained from the compiled rule set.
         *
         * @param ruleFile name of the reference rule file; it is also recorded in
         *                 fRuleFileName, which is used when reporting failures.
         */
        void setup(String ruleFile) {
            fRuleFileName = ruleFile;
            // Reads the rule file contents into fRuleCharBuffer.
            openBreakRules(ruleFile);
            fRuleSet = new BreakRules(this);
            fRuleSet.compileRules(fRuleCharBuffer);
            // run() checks fBI for null before testing and reports an error if it is.
            fBI = fRuleSet.createICUBreakIterator();
            fTestData = new MonkeyTestData();
        };
    745 
    746         void openBreakRules(String fileName) {
    747             StringBuilder testFileBuf = new StringBuilder();
    748             InputStream is = null;
    749             String filePath = "break_rules/" + fileName;
    750             try {
    751                 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
    752                 if (is == null) {
    753                     errln("Could not open test data file " + fileName);
    754                     return;
    755                 }
    756                 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
    757                 try {
    758                     int c;
    759                     int count = 0;
    760                     for (;;) {
    761                         c = isr.read();
    762                         if (c < 0) {
    763                             break;
    764                         }
    765                         count++;
    766                         if (c == 0xFEFF && count == 1) {
    767                             // BOM in the test data file. Discard it.
    768                             continue;
    769                         }
    770                        testFileBuf.appendCodePoint(c);
    771                     }
    772                 } finally {
    773                     isr.close();
    774                 }
    775                 } catch (IOException e) {
    776                 try {
    777                     is.close();
    778                 } catch (IOException ignored) {
    779                 }
    780                 errln(e.toString());
    781             }
    782             fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
    783         }
    784 
    785         class MonkeyException extends RuntimeException  {
    786             private static final long serialVersionUID = 1L;
    787             public int fPosition;    // Position of the failure in the test data.
    788             MonkeyException(String description, int pos) {
    789                 super(description);
    790                 fPosition = pos;
    791             }
    792         }
    793 
    794         @Override
    795         public void run() {
    796             int errorCount = 0;
    797             if (fBI == null) {
    798                 fErrorMsgs.append("Unable to run test because fBI is null.\n");
    799                 return;
    800             }
    801             for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
    802                 try {
    803                     fTestData.set(fRuleSet, fRandomGenerator);
    804                     // fTestData.dump(-1);
    805                     testForwards();
    806                     testPrevious();
    807                     testFollowing();
    808                     testPreceding();
    809                     testIsBoundary();
    810                 } catch (MonkeyException e) {
    811                     String formattedMsg = String.format(
    812                             "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
    813                             e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
    814                     System.err.print(formattedMsg);
    815                     if (fVerbose) {
    816                         fTestData.dump(e.fPosition);
    817                     }
    818                     fErrorMsgs.append(formattedMsg);
    819                     if (++errorCount > 10) {
    820                         return;
    821                     }
    822                 }
    823                 if (fLoopCount < 0 && loopCount % 100 == 0) {
    824                     System.err.print(".");
    825                 }
    826             }
    827         }
    828 
    829         enum CheckDirection {
    830             FORWARD,
    831             REVERSE
    832         };
    833 
    834         void testForwards() {
    835             fTestData.clearActualBreaks();
    836             fBI.setText(fTestData.fString);
    837             int previousBreak = -2;
    838             for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
    839                 if (bk <= previousBreak) {
    840                     throw new MonkeyException("Break Iterator Stall", bk);
    841                 }
    842                 if (bk < 0 || bk > fTestData.fString.length()) {
    843                     throw new MonkeyException("Boundary out of bounds", bk);
    844                 }
    845                 fTestData.fActualBreaks[bk] = true;
    846             }
    847             checkResults("testForwards", CheckDirection.FORWARD);
    848         };
    849 
    850 
    851        void testFollowing() {
    852            fTestData.clearActualBreaks();
    853            fBI.setText(fTestData.fString);
    854            int nextBreak = -1;
    855            for (int i=-1 ; i<fTestData.fString.length(); ++i) {
    856                int bk = fBI.following(i);
    857                if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
    858                    continue;
    859                }
    860                if (bk == nextBreak && bk > i) {
    861                    // i is in the gap between two breaks.
    862                    continue;
    863                }
    864                if (i == nextBreak && bk > nextBreak) {
    865                    fTestData.fActualBreaks[bk] = true;
    866                    nextBreak = bk;
    867                    continue;
    868                }
    869                throw new MonkeyException("following(i)", i);
    870            }
    871            checkResults("testFollowing", CheckDirection.FORWARD);
    872         };
    873 
    874 
    875         void testPrevious() {
    876             fTestData.clearActualBreaks();
    877             fBI.setText(fTestData.fString);
    878             int previousBreak = Integer.MAX_VALUE;
    879             for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
    880                  if (bk >= previousBreak) {
    881                      throw new MonkeyException("Break Iterator Stall", bk);
    882                 }
    883                 if (bk < 0 || bk > fTestData.fString.length()) {
    884                     throw new MonkeyException("Boundary out of bounds", bk);
    885                 }
    886                 fTestData.fActualBreaks[bk] = true;
    887             }
    888             checkResults("testPrevius", CheckDirection.REVERSE);
    889         };
    890 
    891 
    892         /**
    893          * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
    894          * adjust it to point to the lead surrogate, which is the start of the code point.
    895          * @param s the String.
    896          * @param i the initial index
    897          * @return the adjusted index
    898          */
    899         private int getChar32Start(String s, int i) {
    900             if (i > 0 && i < s.length() &&
    901                     Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
    902                 --i;
    903             }
    904             return i;
    905         }
    906 
    907 
    908         void testPreceding() {
    909             fTestData.clearActualBreaks();
    910             fBI.setText(fTestData.fString);
    911             int nextBreak = fTestData.fString.length()+1;
    912             for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
    913                 int bk = fBI.preceding(i);
    914                 // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
    915                 if (bk == BreakIterator.DONE && i == 0) {
    916                     continue;
    917                 }
    918                 if (bk == nextBreak && bk < i) {
    919                     // i is in the gap between two breaks.
    920                     continue;
    921                 }
    922                 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
    923                     // i indexes to a trailing surrogate.
    924                     // Break Iterators treat an index to either half as referring to the supplemental code point,
    925                     // with preceding going to some preceding code point.
    926                     if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
    927                         throw new MonkeyException("preceding of trailing surrogate error", i);
    928                     }
    929                     continue;
    930                 }
    931                 if (i == nextBreak && bk < nextBreak) {
    932                     fTestData.fActualBreaks[bk] = true;
    933                     nextBreak = bk;
    934                     continue;
    935                 }
    936                 throw new MonkeyException("preceding(i)", i);
    937             }
    938             checkResults("testPreceding", CheckDirection.REVERSE);
    939 
    940         };
    941 
    942 
    943         void testIsBoundary() {
    944             fTestData.clearActualBreaks();
    945             fBI.setText(fTestData.fString);
    946             for (int i=fTestData.fString.length(); i>=0; --i) {
    947                 if (fBI.isBoundary(i)) {
    948                     fTestData.fActualBreaks[i] = true;
    949                 }
    950             }
    951             checkResults("testForwards", CheckDirection.FORWARD);
    952         };
    953 
    954 
    955         void checkResults(String msg, CheckDirection direction) {
    956             if (direction == CheckDirection.FORWARD) {
    957                 for (int i=0; i<=fTestData.fString.length(); ++i) {
    958                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
    959                         throw new MonkeyException(msg, i);
    960                     }
    961                 }
    962             } else {
    963                 for (int i=fTestData.fString.length(); i>=0; i--) {
    964                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
    965                         throw new MonkeyException(msg, i);
    966                     }
    967                 }
    968             }
    969 
    970         };
    971 
    972         String                 fRuleCharBuffer;         // source file contents of the reference rules.
    973         BreakRules             fRuleSet;
    974         RuleBasedBreakIterator fBI;
    975         MonkeyTestData         fTestData;
    976         ICU_Rand               fRandomGenerator;
    977         String                 fRuleFileName;
    978         boolean                fVerbose;                 // True to do long dump of failing data.
    979         int                    fLoopCount;
    980         int                    fErrorCount;
    981 
    982         boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
    983         StringBuilder          fErrorMsgs = new StringBuilder();
    984 
    985     }
    986 
    987     //  Test parameters, specified via Java properties.
    988     //
    989     //  rules=file_name   Name of file containing the reference rules.
    990     //  seed=nnnnn        Random number starting seed.
    991     //                    Setting the seed allows errors to be reproduced.
    992     //  loop=nnn          Looping count.  Controls running time.
    993     //                    -1:  run forever.
    994     //                     0 or greater:  run length.
    995     //  expansions        debug option, show expansions of rules and sets.
    996     //  verbose           Display details of the failure.
    997     //
    998     // Parameters are passed to the JVM on the command line, or
    999     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
   1000     // For example,
   1001     //      -ea -Drules=line.txt -Dloop=-1
   1002     //
   1003     @Test
   1004     public void TestMonkey() {
   1005         String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
   1006                 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
   1007         };
   1008 
   1009         String testNameFromParams = getProperty("rules");
   1010 
   1011         if (testNameFromParams != null) {
   1012             tests = new String[] {testNameFromParams};
   1013         }
   1014 
   1015         int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
   1016         boolean dumpExpansions =  getBooleanProperty("expansions", false);
   1017         boolean verbose = getBooleanProperty("verbose", false);
   1018         int seed = getIntProperty("seed", 1);
   1019 
   1020         List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();
   1021 
   1022         // Monkey testing is multi-threaded.
   1023         // Each set of break rules to be tested is run in a separate thread.
   1024         // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
   1025 
   1026         for (String testName: tests) {
   1027             logln(String.format("beginning testing of %s", testName));
   1028 
   1029             RBBIMonkeyImpl test = new RBBIMonkeyImpl();
   1030 
   1031             test.fDumpExpansions = dumpExpansions;
   1032             test.fVerbose = verbose;
   1033             test.fRandomGenerator = new ICU_Rand(seed);
   1034             test.fLoopCount = loopCount;
   1035             test.setup(testName);
   1036 
   1037             test.start();
   1038             startedTests.add(test);
   1039         }
   1040 
   1041         StringBuilder errors = new StringBuilder();
   1042         for (RBBIMonkeyImpl test: startedTests) {
   1043             try {
   1044                 test.join();
   1045                 errors.append(test.fErrorMsgs);
   1046             } catch (InterruptedException e) {
   1047                 errors.append(e + "\n");
   1048             }
   1049         }
   1050         String errorMsgs = errors.toString();
   1051         assertEquals(errorMsgs, "", errorMsgs);
   1052 
   1053     }
   1054 
   1055 
   1056 }
   1057