1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 5 package android.icu.dev.test.rbbi; 6 7 import java.io.IOException; 8 import java.io.InputStream; 9 import java.io.InputStreamReader; 10 import java.util.ArrayList; 11 import java.util.Arrays; 12 import java.util.HashMap; 13 import java.util.List; 14 import java.util.Map; 15 import java.util.regex.Matcher; 16 import java.util.regex.Pattern; 17 import java.util.regex.PatternSyntaxException; 18 19 import org.junit.Test; 20 import org.junit.runner.RunWith; 21 import org.junit.runners.JUnit4; 22 23 import android.icu.dev.test.TestFmwk; 24 import android.icu.impl.UCharacterName; 25 import android.icu.impl.UCharacterNameChoice; 26 import android.icu.text.BreakIterator; 27 import android.icu.text.RuleBasedBreakIterator; 28 import android.icu.text.UnicodeSet; 29 import android.icu.util.ULocale; 30 import android.icu.testsharding.MainTestShard; 31 32 /** 33 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp. 34 * This is the newer, data driven monkey test. It is completely separate from the 35 * older class RBBITestMonkey. 36 */ 37 38 @MainTestShard 39 @RunWith(JUnit4.class) 40 public class RBBIMonkeyTest extends TestFmwk { 41 42 43 // class CharClass Represents a single character class from the source break rules. 44 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 45 // deletes them using hash's object deleter function. 46 47 static class CharClass { 48 String fName; 49 String fOriginalDef; // set definition as it appeared in user supplied rules. 50 String fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 51 UnicodeSet fSet; 52 CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) { 53 fName = name; 54 fOriginalDef = originalDef; 55 fExpandedDef = expandedDef; 56 fSet = set; 57 }; 58 } 59 60 61 // class BreakRule Struct-like class represents a single rule from a set of break rules. 62 // Each rule has the set definitions expanded, and 63 // is compiled to a regular expression. 64 65 static class BreakRule { 66 String fName; // Name of the rule. 67 String fRule; // Rule expression, excluding the name, as written in user source. 68 String fExpandedRule; // Rule expression after expanding the set definitions. 69 Matcher fRuleMatcher; // Regular expression that matches the rule. 70 }; 71 72 73 // class BreakRules represents a complete set of break rules, possibly tailored, 74 // compiled from testdata break rules. 75 76 static class BreakRules { 77 BreakRules(RBBIMonkeyImpl monkeyImpl) { 78 fMonkeyImpl = monkeyImpl; 79 fBreakRules = new ArrayList<BreakRule>(); 80 fType = BreakIterator.KIND_TITLE; 81 fCharClasses = new HashMap<String, CharClass>(); 82 fCharClassList = new ArrayList<CharClass>(); 83 fDictionarySet = new UnicodeSet(); 84 85 // Match an alpha-numeric identifier in a rule. Will be a set name. 86 // Use negative look-behind to exclude non-identifiers, mostly property names or values. 87 fSetRefsMatcher = Pattern.compile( 88 "(?<!\\{[ \\t]{0,4})" + 89 "(?<!=[ \\t]{0,4})" + 90 "(?<!\\[:[ \\t]{0,4})" + 91 "(?<!\\\\)" + 92 "(?<![A-Za-z0-9_])" + 93 "([A-Za-z_][A-Za-z0-9_]*)"). // The char class name 94 matcher(""); 95 96 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 97 fCommentsMatcher = Pattern.compile("" + 98 "(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';') 99 "[ \\t]*+" + // Match white space. 100 "(#.*)?+" + // Optional # plus whatever follows 101 "$"). // new-line at end of line. 102 matcher(""); 103 104 // Match (initial parse) of a character class definition line. 105 fClassDefMatcher = Pattern.compile("" + 106 "[ \\t]*" + // leading white space 107 "([A-Za-z_][A-Za-z0-9_]*)" + // The char class name 108 "[ \\t]*=[ \\t]*" + // = 109 "(.*?)" + // The char class UnicodeSet expression 110 "[ \\t]*;$"). // ; <end of line> 111 matcher(""); 112 113 // Match (initial parse) of a break rule line. 114 fRuleDefMatcher = Pattern.compile("" + 115 "[ \\t]*" + // leading white space 116 "([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name 117 "[ \\t]*:[ \\t]*" + // : 118 "(.*?)" + // The rule definition 119 "[ \\t]*;$"). // ; <end of line> 120 matcher(""); 121 122 // Match a property expression, either [:xxx:] or \p{...} 123 fPropertyMatcher = Pattern.compile("" + 124 "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}"). 125 matcher(""); 126 127 128 } 129 130 /** 131 * Create the expanded definition for this char class, 132 * replacing any set references with the corresponding definition. 133 */ 134 CharClass addCharClass(String name, String definition) { 135 StringBuffer expandedDef = new StringBuffer(); 136 fSetRefsMatcher.reset(definition); 137 while (fSetRefsMatcher.find()) { 138 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 139 CharClass snameClass = fCharClasses.get(sname); 140 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname; 141 142 fSetRefsMatcher.appendReplacement(expandedDef, ""); 143 expandedDef.append(expansionForName); 144 } 145 fSetRefsMatcher.appendTail(expandedDef); 146 String expandedDefString = expandedDef.toString(); 147 148 if (fMonkeyImpl.fDumpExpansions) { 149 System.out.printf("addCharClass(\"%s\"\n", name); 150 System.out.printf(" %s\n", definition); 151 System.out.printf("expandedDef: %s\n", expandedDefString); 152 } 153 154 // Verify that the expanded set definition is valid. 155 156 UnicodeSet s; 157 try { 158 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE); 159 } catch (java.lang.IllegalArgumentException e) { 160 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name); 161 throw e; 162 } 163 164 // Get an expanded equivalent pattern from the UnicodeSet. 165 // This removes set difference operators, which would fail if passed through to Java regex. 166 167 StringBuffer expandedPattern = new StringBuffer(); 168 s._generatePattern(expandedPattern, true); 169 expandedDefString = expandedPattern.toString(); 170 if (fMonkeyImpl.fDumpExpansions) { 171 System.out.printf("expandedDef2: %s\n", expandedDefString); 172 } 173 174 CharClass cclass = new CharClass(name, definition, expandedDefString, s); 175 CharClass previousClass = fCharClasses.put(name, cclass); 176 177 if (previousClass != null) { 178 // TODO: decide whether or not to allow redefinitions. 179 // Can be convenient in some cases. 180 // String msg = String.format("%s: Redefinition of character class %s\n", 181 // fMonkeyImpl.fRuleFileName, cclass.fName); 182 // System.err.println(msg); 183 // throw new IllegalArgumentException(msg); 184 } 185 return cclass; 186 187 }; 188 189 190 void addRule(String name, String definition) { 191 BreakRule thisRule = new BreakRule(); 192 StringBuffer expandedDefsRule = new StringBuffer(); 193 thisRule.fName = name; 194 thisRule.fRule = definition; 195 196 // Expand the char class definitions within the rule. 197 fSetRefsMatcher.reset(definition); 198 while (fSetRefsMatcher.find()) { 199 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 200 CharClass nameClass = fCharClasses.get(sname); 201 if (nameClass == null) { 202 System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition); 203 } 204 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname; 205 fSetRefsMatcher.appendReplacement(expandedDefsRule, ""); 206 expandedDefsRule.append(expansionForName); 207 } 208 fSetRefsMatcher.appendTail(expandedDefsRule); 209 210 // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion, 211 // obtained from ICU UnicodeSet. Need to do this substitution because Java regex 212 // does not recognize all properties, and because Java's definitions are likely 213 // older than ICU's. 214 215 StringBuffer expandedRule = new StringBuffer(); 216 fPropertyMatcher.reset(expandedDefsRule); 217 while (fPropertyMatcher.find()) { 218 String prop = fPropertyMatcher.group(); 219 UnicodeSet propSet = new UnicodeSet("[" + prop + "]"); 220 StringBuffer propExpansion = new StringBuffer(); 221 propSet._generatePattern(propExpansion, true); 222 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString()); 223 } 224 fPropertyMatcher.appendTail(expandedRule); 225 226 // Replace any [^negated sets] with equivalent flattened sets generated by 227 // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply 228 // to any nested classes. Variable substitution in rules produces 229 // nested sets that [^negation] needs to apply to. 230 231 StringBuffer ruleWithFlattenedSets = new StringBuffer(); 232 int idx = 0; 233 while (idx<expandedRule.length()) { 234 int setOpenPos = expandedRule.indexOf("[^", idx); 235 if (setOpenPos < 0) { 236 break; 237 } 238 if (setOpenPos > idx) { 239 // Move anything from the source rule preceding the [^ into the processed rule, unchanged. 240 ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos)); 241 } 242 int nestingLevel = 1; 243 boolean haveNesting = false; 244 int setClosePos; 245 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) { 246 char c = expandedRule.charAt(setClosePos); 247 if (c == '\\') { 248 ++setClosePos; 249 } else if (c == '[') { 250 ++nestingLevel; 251 haveNesting = true; 252 } else if (c == ']') { 253 --nestingLevel; 254 } 255 } 256 if (haveNesting && nestingLevel == 0) { 257 // Found one, a negated set that includes interior nested sets. 258 // Create an ICU UnicodeSet from the source pattern, and obtain an 259 // equivalent flattened pattern from that. 260 UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true); 261 uset._generatePattern(ruleWithFlattenedSets, true); 262 } else { 263 // The [^ set definition did not include any nested sets. 264 // Copy the original definition without change. 265 // Java regular expressions will handle it without needing to recast it. 266 if (nestingLevel > 0) { 267 // Error case of an unclosed character class expression. 268 // Java regex will also eventually flag the error. 269 System.err.printf("No closing ] found in rule %s\n", name); 270 } 271 ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos)); 272 } 273 idx = setClosePos; 274 } 275 276 if (idx < expandedRule.length()) { 277 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length())); 278 } 279 280 thisRule.fExpandedRule = ruleWithFlattenedSets.toString(); 281 282 // Replace the divide sign (\u00f7) with a regular expression named capture. 283 // When running the rules, a match that includes this group means we found a break position. 284 285 // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "(?<BreakPosition>)"); 286 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("", "()"); 287 if (thisRule.fExpandedRule.indexOf("") != -1) { 288 String msg = String.format("%s Rule %s contains multiple signs", fMonkeyImpl.fRuleFileName, name); 289 System.err.println(msg); 290 throw new IllegalArgumentException(msg); 291 } 292 293 // UAX break rule set definitions can be empty, just []. 294 // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which 295 // also matches nothing. 296 297 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]"); 298 299 // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer) 300 // \udddd => \x{dddd} 301 // \U00hhhhhh => \x{hhhhhh} 302 303 // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}"); 304 // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}"); 305 306 // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character 307 // within a regular expression character class. Put them in as unescaped literal chars. 308 StringBuilder sb = new StringBuilder(thisRule.fExpandedRule); 309 while (true) { 310 int where = sb.indexOf("\\U00"); 311 if (where < 0) { 312 break; 313 } 314 String cp = hexToCodePoint(sb.substring(where+2, where+10)); 315 sb.replace(where, where+10, cp); 316 } 317 thisRule.fExpandedRule = sb.toString(); 318 319 // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment. 320 // UnicodeSet._generatePattern() inserts un-escaped "#"s 321 322 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#"); 323 if (fMonkeyImpl.fDumpExpansions) { 324 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule); 325 } 326 327 // Compile a regular expression for this rule. 328 329 try { 330 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher(""); 331 } catch (PatternSyntaxException e) { 332 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"", 333 fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule); 334 throw e; 335 } 336 337 // Put this new rule into the vector of all Rules. 338 339 fBreakRules.add(thisRule); 340 }; 341 342 private static String hexToCodePoint(String hex) { 343 int cp = Integer.parseInt(hex, 16); 344 return new StringBuilder().appendCodePoint(cp).toString(); 345 } 346 347 348 boolean setKeywordParameter(String keyword, String value) { 349 if (keyword.equals("locale")) { 350 fLocale = new ULocale(value); 351 return true; 352 } 353 if (keyword.equals("type")) { 354 if (value.equals("grapheme")) { 355 fType = BreakIterator.KIND_CHARACTER; 356 } else if (value.equals("word")) { 357 fType = BreakIterator.KIND_WORD; 358 } else if (value.equals("line")) { 359 fType = BreakIterator.KIND_LINE; 360 } else if (value.equals("sentence")) { 361 fType = BreakIterator.KIND_SENTENCE; 362 } else { 363 String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value); 364 System.err.println(msg); 365 throw new IllegalArgumentException(msg); 366 } 367 return true; 368 } 369 return false; 370 } 371 372 373 RuleBasedBreakIterator createICUBreakIterator() { 374 BreakIterator bi; 375 switch(fType) { 376 case BreakIterator.KIND_CHARACTER: 377 bi = (BreakIterator.getCharacterInstance(fLocale)); 378 break; 379 case BreakIterator.KIND_WORD: 380 bi = (BreakIterator.getWordInstance(fLocale)); 381 break; 382 case BreakIterator.KIND_LINE: 383 bi = (BreakIterator.getLineInstance(fLocale)); 384 break; 385 case BreakIterator.KIND_SENTENCE: 386 bi = (BreakIterator.getSentenceInstance(fLocale)); 387 break; 388 default: 389 String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType); 390 System.err.println(msg); 391 throw new IllegalArgumentException(msg); 392 } 393 return (RuleBasedBreakIterator)bi; 394 395 }; 396 397 398 399 void compileRules(String rules) { 400 int lineNumber = 0; 401 for (String line: rules.split("\\r?\\n")) { 402 ++lineNumber; 403 // Strip comment lines. 404 fCommentsMatcher.reset(line); 405 line = fCommentsMatcher.replaceFirst(""); 406 if (line.isEmpty()) { 407 continue; 408 } 409 410 // Recognize character class definition and keyword lines 411 fClassDefMatcher.reset(line); 412 if (fClassDefMatcher.matches()) { 413 String className = fClassDefMatcher.group(/*"ClassName"*/ 1); 414 String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2); 415 if (fMonkeyImpl.fDumpExpansions) { 416 System.out.printf("scanned class: %s = %s\n", className, classDef); 417 } 418 if (setKeywordParameter(className, classDef)) { 419 // The scanned item was "type = ..." or "locale = ...", etc. 420 // which are not actual character classes. 421 continue; 422 } 423 addCharClass(className, classDef); 424 continue; 425 } 426 427 // Recognize rule lines. 428 fRuleDefMatcher.reset(line); 429 if (fRuleDefMatcher.matches()) { 430 String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1); 431 String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2); 432 if (fMonkeyImpl.fDumpExpansions) { 433 System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef); 434 } 435 addRule(ruleName, ruleDef); 436 continue; 437 } 438 439 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"", 440 fMonkeyImpl.fRuleFileName, lineNumber, line); 441 System.err.println(msg); 442 throw new IllegalArgumentException(msg); 443 } 444 445 // Build the vector of char classes, omitting the dictionary class if there is one. 446 // This will be used when constructing the random text to be tested. 447 448 // Also compute the "other" set, consisting of any characters not included in 449 // one or more of the user defined sets. 450 451 UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff); 452 453 for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) { 454 String ccName = el.getKey(); 455 CharClass cclass = el.getValue(); 456 457 // System.out.printf(" Adding %s\n", ccName); 458 if (!ccName.equals(cclass.fName)) { 459 throw new IllegalArgumentException( 460 String.format("%s: internal error, set names (%s, %s) inconsistent.\n", 461 fMonkeyImpl.fRuleFileName, ccName, cclass.fName)); 462 } 463 otherSet.removeAll(cclass.fSet); 464 if (ccName.equals("dictionary")) { 465 fDictionarySet = cclass.fSet; 466 } else { 467 fCharClassList.add(cclass); 468 } 469 } 470 471 if (!otherSet.isEmpty()) { 472 // System.out.printf("have an other set.\n"); 473 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true)); 474 fCharClassList.add(cclass); 475 } 476 477 }; 478 479 CharClass getClassForChar(int c) { 480 for (CharClass cc: fCharClassList) { 481 if (cc.fSet.contains(c)) { 482 return cc; 483 } 484 } 485 return null; 486 }; 487 488 489 RBBIMonkeyImpl fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 490 List<BreakRule> fBreakRules; // Contents are of type (BreakRule *). 491 492 Map<String, CharClass> fCharClasses; // Key is the set name. 493 // // Value is the corresponding CharClass 494 List<CharClass> fCharClassList; // Char Classes, same contents as fCharClasses values, 495 496 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 497 ULocale fLocale; 498 int fType; // BreakItererator.KIND_WORD, etc. 499 500 501 Matcher fSetRefsMatcher; 502 Matcher fCommentsMatcher; 503 Matcher fClassDefMatcher; 504 Matcher fRuleDefMatcher; 505 Matcher fPropertyMatcher; 506 }; 507 508 509 510 511 // class MonkeyTestData represents a randomly synthesized test data string together 512 // with the expected break positions obtained by applying 513 // the test break rules. 514 515 static class MonkeyTestData{ 516 517 void set(BreakRules rules, ICU_Rand rand) { 518 int dataLength = 1000; // length of test data to generate, in code points. 519 520 // Fill the test string with random characters. 521 // First randomly pick a char class, then randomly pick a character from that class. 522 // Exclude any characters from the dictionary set. 523 524 // System.out.println("Populating Test Data"); 525 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 526 // allowing recreation of failing data. 527 fBkRules = rules; 528 StringBuilder newString = new StringBuilder(); 529 for (int n=0; n<dataLength;) { 530 int charClassIndex = rand.next() % rules.fCharClassList.size(); 531 CharClass cclass = rules.fCharClassList.get(charClassIndex); 532 if (cclass.fSet.size() == 0) { 533 // Some rules or tailorings do end up with empty char classes. 534 continue; 535 } 536 int charIndex = rand.next() % cclass.fSet.size(); 537 int c = cclass.fSet.charAt(charIndex); 538 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) && 539 newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) { 540 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 541 // Don't let random unpaired surrogates combine in the test data because they might 542 // produce an unwanted dictionary character. 543 continue; 544 } 545 546 if (!rules.fDictionarySet.contains(c)) { 547 newString.appendCodePoint(c); 548 ++n; 549 } 550 } 551 fString = newString.toString(); 552 553 // Init the expectedBreaks, actualBreaks and ruleForPosition. 554 // Expected and Actual breaks are one longer than the input string; a true value 555 // will indicate a boundary preceding that position. 556 557 fActualBreaks = new boolean[fString.length()+1]; 558 fExpectedBreaks = new boolean[fString.length()+1]; 559 fRuleForPosition = new int[fString.length()+1]; 560 f2ndRuleForPos = new int[fString.length()+1]; 561 562 // Apply reference rules to find the expected breaks. 563 564 fExpectedBreaks[0] = true; // Force an expected break before the start of the text. 565 // ICU always reports a break there. 566 // The reference rules do not have a means to do so. 567 int strIdx = 0; 568 while (strIdx < fString.length()) { 569 BreakRule matchingRule = null; 570 boolean hasBreak = false; 571 int ruleNum = 0; 572 int matchStart = 0; 573 int matchEnd = 0; 574 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) { 575 BreakRule rule = rules.fBreakRules.get(ruleNum); 576 rule.fRuleMatcher.reset(fString.substring(strIdx)); 577 if (rule.fRuleMatcher.lookingAt()) { 578 // A candidate rule match, check further to see if we take it or continue to check other rules. 579 // Matches of zero or one code point count only if they also specify a break. 580 matchStart = strIdx; 581 matchEnd = strIdx + rule.fRuleMatcher.end(); 582 hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0; 583 if (hasBreak || 584 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) { 585 matchingRule = rule; 586 break; 587 } 588 } 589 } 590 if (matchingRule == null) { 591 // No reference rule matched. This is an error in the rules that should never happen. 592 String msg = String.format("%s: No reference rules matched at position %d. ", 593 rules.fMonkeyImpl.fRuleFileName, strIdx); 594 System.err.println(msg); 595 dump(strIdx); 596 throw new IllegalArgumentException(msg); 597 } 598 if (matchingRule.fRuleMatcher.group().length() == 0) { 599 // Zero length rule match. This is also an error in the rule expressions. 600 String msg = String.format("%s:%s: Zero length rule match at %d.", 601 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx); 602 System.err.println(msg); 603 dump(strIdx); 604 throw new IllegalArgumentException(msg); 605 } 606 607 // Record which rule matched over the length of the match. 608 for (int i = matchStart; i < matchEnd; i++) { 609 if (fRuleForPosition[i] == 0) { 610 fRuleForPosition[i] = ruleNum; 611 } else { 612 f2ndRuleForPos[i] = ruleNum; 613 } 614 } 615 616 // Break positions appear in rules as a matching named capture of zero length at the break position, 617 // the adjusted pattern contains (?<BreakPosition>) 618 if (hasBreak) { 619 int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher); 620 fExpectedBreaks[breakPos] = true; 621 // System.out.printf("recording break at %d\n", breakPos); 622 // For the next iteration, pick up applying rules immediately after the break, 623 // which may differ from end of the match. The matching rule may have included 624 // context following the boundary that needs to be looked at again. 625 strIdx = breakPos; 626 } else { 627 // Original rule didn't specify a break. 628 // Continue applying rules starting on the last code point of this match. 629 int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1); 630 if (updatedStrIdx == matchStart) { 631 // Match was only one code point, no progress if we continue. 632 // Shouldn't get here, case is filtered out at top of loop. 633 throw new IllegalArgumentException(String.format("%s: Rule %s internal error.", 634 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName)); 635 } 636 strIdx = updatedStrIdx; 637 } 638 } 639 }; 640 641 // Helper function to find the starting index of a match of the "BreakPosition" named capture group. 642 // @param m: a Java regex Matcher that has completed a matching operation. 643 // @return m.start("BreakPosition), 644 // or -1 if there is no such group, or the group did not participate in the match. 645 // 646 // TODO: this becomes m.start("BreakPosition") with Java 8. 647 // In the mean time, assume that the only zero-length capturing group in 648 // a reference rule expression is the "BreakPosition" that corresponds to a "". 649 650 static int BreakGroupStart(Matcher m) { 651 for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) { 652 String group = m.group(groupNum); 653 if (group == null) { 654 continue; 655 } 656 if (group.equals("")) { 657 // assert(m.end(groupNum) == m.end("BreakPosition")); 658 return m.start(groupNum); 659 } 660 } 661 return -1; 662 } 663 664 void dump(int around) { 665 System.out.print("\n" 666 + " char break Rule Character\n" 667 + " pos code class R I name name\n" 668 + "---------------------------------------------------------------------------------------------\n"); 669 670 int start; 671 int end; 672 673 if (around == -1) { 674 start = 0; 675 end = fString.length(); 676 } else { 677 // Display context around a failure. 678 try { 679 start = fString.offsetByCodePoints(around, -30); 680 } catch (Exception e) { 681 start = 0; 682 } 683 try { 684 end = fString.offsetByCodePoints(around, +30); 685 } catch (Exception e) { 686 end = fString.length(); 687 } 688 } 689 690 for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) { 691 int c = fString.codePointAt(charIdx); 692 CharClass cc = fBkRules.getClassForChar(c); 693 694 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]); 695 String secondRuleName = ""; 696 if (f2ndRuleForPos[charIdx] > 0) { 697 secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName; 698 } 699 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME); 700 701 System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n", 702 charIdx, c, cc.fName, 703 fExpectedBreaks[charIdx] ? '*' : '.', 704 fActualBreaks[charIdx] ? '*' : '.', 705 rule.fName, secondRuleName, cName 706 ); 707 } 708 709 }; 710 711 void clearActualBreaks() { 712 Arrays.fill(fActualBreaks, false); 713 } 714 715 716 int fRandomSeed; // The initial seed value from the random number generator. 717 BreakRules fBkRules; // The break rules used to generate this data. 718 String fString; // The text. 719 boolean fExpectedBreaks[]; // Breaks as found by the reference rules. 720 // Parallel to fString. true if break preceding. 721 boolean fActualBreaks[]; // Breaks as found by ICU break iterator. 722 int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position. 723 // Also parallel to fString. 724 int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule 725 // didn't cause a break, and a subsequent rule match starts 726 // on the last code point of the preceding match. 727 728 } 729 730 731 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 732 // test for one set of break rules. 733 // 734 735 static class RBBIMonkeyImpl extends Thread { 736 737 void setup(String ruleFile) { 738 fRuleFileName = ruleFile; 739 openBreakRules(ruleFile); 740 fRuleSet = new BreakRules(this); 741 fRuleSet.compileRules(fRuleCharBuffer); 742 fBI = fRuleSet.createICUBreakIterator(); 743 fTestData = new MonkeyTestData(); 744 }; 745 746 void openBreakRules(String fileName) { 747 StringBuilder testFileBuf = new StringBuilder(); 748 InputStream is = null; 749 String filePath = "break_rules/" + fileName; 750 try { 751 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath); 752 if (is == null) { 753 errln("Could not open test data file " + fileName); 754 return; 755 } 756 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 757 try { 758 int c; 759 int count = 0; 760 for (;;) { 761 c = isr.read(); 762 if (c < 0) { 763 break; 764 } 765 count++; 766 if (c == 0xFEFF && count == 1) { 767 // BOM in the test data file. Discard it. 768 continue; 769 } 770 testFileBuf.appendCodePoint(c); 771 } 772 } finally { 773 isr.close(); 774 } 775 } catch (IOException e) { 776 try { 777 is.close(); 778 } catch (IOException ignored) { 779 } 780 errln(e.toString()); 781 } 782 fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */ 783 } 784 785 class MonkeyException extends RuntimeException { 786 private static final long serialVersionUID = 1L; 787 public int fPosition; // Position of the failure in the test data. 788 MonkeyException(String description, int pos) { 789 super(description); 790 fPosition = pos; 791 } 792 } 793 794 @Override 795 public void run() { 796 int errorCount = 0; 797 if (fBI == null) { 798 fErrorMsgs.append("Unable to run test because fBI is null.\n"); 799 return; 800 } 801 for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 802 try { 803 fTestData.set(fRuleSet, fRandomGenerator); 804 // fTestData.dump(-1); 805 testForwards(); 806 testPrevious(); 807 testFollowing(); 808 testPreceding(); 809 testIsBoundary(); 810 } catch (MonkeyException e) { 811 String formattedMsg = String.format( 812 "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n", 813 e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed); 814 System.err.print(formattedMsg); 815 if (fVerbose) { 816 fTestData.dump(e.fPosition); 817 } 818 fErrorMsgs.append(formattedMsg); 819 if (++errorCount > 10) { 820 return; 821 } 822 } 823 if (fLoopCount < 0 && loopCount % 100 == 0) { 824 System.err.print("."); 825 } 826 } 827 } 828 829 enum CheckDirection { 830 FORWARD, 831 REVERSE 832 }; 833 834 void testForwards() { 835 fTestData.clearActualBreaks(); 836 fBI.setText(fTestData.fString); 837 int previousBreak = -2; 838 for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) { 839 if (bk <= previousBreak) { 840 throw new MonkeyException("Break Iterator Stall", bk); 841 } 842 if (bk < 0 || bk > fTestData.fString.length()) { 843 throw new MonkeyException("Boundary out of bounds", bk); 844 } 845 fTestData.fActualBreaks[bk] = true; 846 } 847 checkResults("testForwards", CheckDirection.FORWARD); 848 }; 849 850 851 void testFollowing() { 852 fTestData.clearActualBreaks(); 853 fBI.setText(fTestData.fString); 854 int nextBreak = -1; 855 for (int i=-1 ; i<fTestData.fString.length(); ++i) { 856 int bk = fBI.following(i); 857 if (bk == BreakIterator.DONE && i == fTestData.fString.length()) { 858 continue; 859 } 860 if (bk == nextBreak && bk > i) { 861 // i is in the gap between two breaks. 862 continue; 863 } 864 if (i == nextBreak && bk > nextBreak) { 865 fTestData.fActualBreaks[bk] = true; 866 nextBreak = bk; 867 continue; 868 } 869 throw new MonkeyException("following(i)", i); 870 } 871 checkResults("testFollowing", CheckDirection.FORWARD); 872 }; 873 874 875 void testPrevious() { 876 fTestData.clearActualBreaks(); 877 fBI.setText(fTestData.fString); 878 int previousBreak = Integer.MAX_VALUE; 879 for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) { 880 if (bk >= previousBreak) { 881 throw new MonkeyException("Break Iterator Stall", bk); 882 } 883 if (bk < 0 || bk > fTestData.fString.length()) { 884 throw new MonkeyException("Boundary out of bounds", bk); 885 } 886 fTestData.fActualBreaks[bk] = true; 887 } 888 checkResults("testPrevius", CheckDirection.REVERSE); 889 }; 890 891 892 /** 893 * Given an index into a string, if it refers to the trail surrogate of a surrogate pair, 894 * adjust it to point to the lead surrogate, which is the start of the code point. 895 * @param s the String. 896 * @param i the initial index 897 * @return the adjusted index 898 */ 899 private int getChar32Start(String s, int i) { 900 if (i > 0 && i < s.length() && 901 Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) { 902 --i; 903 } 904 return i; 905 } 906 907 908 void testPreceding() { 909 fTestData.clearActualBreaks(); 910 fBI.setText(fTestData.fString); 911 int nextBreak = fTestData.fString.length()+1; 912 for (int i=fTestData.fString.length()+1 ; i>=0; --i) { 913 int bk = fBI.preceding(i); 914 // System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 915 if (bk == BreakIterator.DONE && i == 0) { 916 continue; 917 } 918 if (bk == nextBreak && bk < i) { 919 // i is in the gap between two breaks. 920 continue; 921 } 922 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) { 923 // i indexes to a trailing surrogate. 924 // Break Iterators treat an index to either half as referring to the supplemental code point, 925 // with preceding going to some preceding code point. 926 if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) { 927 throw new MonkeyException("preceding of trailing surrogate error", i); 928 } 929 continue; 930 } 931 if (i == nextBreak && bk < nextBreak) { 932 fTestData.fActualBreaks[bk] = true; 933 nextBreak = bk; 934 continue; 935 } 936 throw new MonkeyException("preceding(i)", i); 937 } 938 checkResults("testPreceding", CheckDirection.REVERSE); 939 940 }; 941 942 943 void testIsBoundary() { 944 fTestData.clearActualBreaks(); 945 fBI.setText(fTestData.fString); 946 for (int i=fTestData.fString.length(); i>=0; --i) { 947 if (fBI.isBoundary(i)) { 948 fTestData.fActualBreaks[i] = true; 949 } 950 } 951 checkResults("testForwards", CheckDirection.FORWARD); 952 }; 953 954 955 void checkResults(String msg, CheckDirection direction) { 956 if (direction == CheckDirection.FORWARD) { 957 for (int i=0; i<=fTestData.fString.length(); ++i) { 958 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 959 throw new MonkeyException(msg, i); 960 } 961 } 962 } else { 963 for (int i=fTestData.fString.length(); i>=0; i--) { 964 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 965 throw new MonkeyException(msg, i); 966 } 967 } 968 } 969 970 }; 971 972 String fRuleCharBuffer; // source file contents of the reference rules. 973 BreakRules fRuleSet; 974 RuleBasedBreakIterator fBI; 975 MonkeyTestData fTestData; 976 ICU_Rand fRandomGenerator; 977 String fRuleFileName; 978 boolean fVerbose; // True to do long dump of failing data. 979 int fLoopCount; 980 int fErrorCount; 981 982 boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets. 983 StringBuilder fErrorMsgs = new StringBuilder(); 984 985 } 986 987 // Test parameters, specified via Java properties. 988 // 989 // rules=file_name Name of file containing the reference rules. 990 // seed=nnnnn Random number starting seed. 991 // Setting the seed allows errors to be reproduced. 992 // loop=nnn Looping count. Controls running time. 993 // -1: run forever. 994 // 0 or greater: run length. 995 // expansions debug option, show expansions of rules and sets. 996 // verbose Display details of the failure. 997 // 998 // Parameters are passed to the JVM on the command line, or 999 // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 1000 // For example, 1001 // -ea -Drules=line.txt -Dloop=-1 1002 // 1003 @Test 1004 public void TestMonkey() { 1005 String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt", 1006 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt" 1007 }; 1008 1009 String testNameFromParams = getProperty("rules"); 1010 1011 if (testNameFromParams != null) { 1012 tests = new String[] {testNameFromParams}; 1013 } 1014 1015 int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000); 1016 boolean dumpExpansions = getBooleanProperty("expansions", false); 1017 boolean verbose = getBooleanProperty("verbose", false); 1018 int seed = getIntProperty("seed", 1); 1019 1020 List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>(); 1021 1022 // Monkey testing is multi-threaded. 1023 // Each set of break rules to be tested is run in a separate thread. 1024 // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 1025 1026 for (String testName: tests) { 1027 logln(String.format("beginning testing of %s", testName)); 1028 1029 RBBIMonkeyImpl test = new RBBIMonkeyImpl(); 1030 1031 test.fDumpExpansions = dumpExpansions; 1032 test.fVerbose = verbose; 1033 test.fRandomGenerator = new ICU_Rand(seed); 1034 test.fLoopCount = loopCount; 1035 test.setup(testName); 1036 1037 test.start(); 1038 startedTests.add(test); 1039 } 1040 1041 StringBuilder errors = new StringBuilder(); 1042 for (RBBIMonkeyImpl test: startedTests) { 1043 try { 1044 test.join(); 1045 errors.append(test.fErrorMsgs); 1046 } catch (InterruptedException e) { 1047 errors.append(e + "\n"); 1048 } 1049 } 1050 String errorMsgs = errors.toString(); 1051 assertEquals(errorMsgs, "", errorMsgs); 1052 1053 } 1054 1055 1056 } 1057