Home | History | Annotate | Download | only in intltest
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*************************************************************************
      4  * Copyright (c) 2016, International Business Machines
      5  * Corporation and others. All Rights Reserved.
      6  *************************************************************************
      7 */
      8 #ifndef RBBIMONKEYTEST_H
      9 #define RBBIMONKEYTEST_H
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
     14 
     15 #include "intltest.h"
     16 
     17 #include "unicode/rbbi.h"
     18 #include "unicode/regex.h"
     19 #include "unicode/uniset.h"
     20 #include "unicode/unistr.h"
     21 #include "unicode/uobject.h"
     22 
     23 #include "simplethread.h"
     24 #include "ucbuf.h"
     25 #include "uhash.h"
     26 #include "uvector.h"
     27 
     28 // RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
     29 //                   an independent reference implementation.
     30 //
     31 //         The monkey test can be run with parameters, e.g.
     32 //              intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
     33 //         will run word break testing in an infinite loop.
     34 //         Summary of options
     35 //               rules=name             Test against the named reference rule file.
     36 //                                     Files are found in source/test/testdata/break_rules
     37 //               loop=nnn              Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
     38 //               seed=nnnn             Random number generator seed. Allows recreation of a failure.
     39 //                                     Error messages include the necessary seed value.
     40 //               verbose               Display details of a failure. Useful for debugging. Use with loop=1.
     41 //               expansions            Debug option, show expansions of rules and sets.
     42 //
     43 //  TODO:
     44 //     Develop a tailoring format.
     45 //     Hook to old tests that use monkey impl to get expected data.
     46 //     Remove old tests.
     47 
     48 class BreakRules;       // Forward declaration
     49 class RBBIMonkeyImpl;
     50 
     51 /**
     52  * Test the RuleBasedBreakIterator class giving different rules
     53  */
     54 class RBBIMonkeyTest: public IntlTest {
     55   public:
     56     RBBIMonkeyTest();
     57     virtual ~RBBIMonkeyTest();
     58 
     59     void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
     60     void testMonkey();
     61 
     62 
     63   private:
     64     const char *fParams;                  // Copy of user parameters passed in from IntlTest.
     65 
     66 
     67     void testRules(const char *ruleFile);
     68     static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
     69     static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
     70     static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
     71 
     72 };
     73 
     74 // The following classes are internal to the RBBI Monkey Test implementation.
     75 
     76 
     77 
     78 //  class CharClass    Represents a single character class from the source break rules.
     79 //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
     80 //                     deletes them using hash's object deleter function.
     81 
     82 class CharClass: public UObject {
     83   public:
     84     UnicodeString                fName;
     85     UnicodeString                fOriginalDef;    // set definition as it appeared in user supplied rules.
     86     UnicodeString                fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
     87     LocalPointer<const UnicodeSet>     fSet;
     88     CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
     89             fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
     90 };
     91 
     92 
     93 // class BreakRule    represents a single rule from a set of break rules.
     94 //                    Each rule has the set definitions expanded, and
     95 //                    is compiled to a regular expression.
     96 
     97 class BreakRule: public UObject {
     98   public:
     99     BreakRule();
    100     ~BreakRule();
    101     UnicodeString    fName;                            // Name of the rule.
    102     UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
    103     UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
    104     LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
    105     bool             fInitialMatchOnly = false;        // True if rule begins with '^', meaning no chaining.
    106 };
    107 
    108 
    109 // class BreakRules    represents a complete set of break rules, possibly tailored,
    110 //                     compiled from testdata break rules.
    111 
    112 class BreakRules: public UObject {
    113   public:
    114     BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
    115     ~BreakRules();
    116 
    117     void compileRules(UCHARBUF *rules, UErrorCode &status);
    118 
    119     const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
    120 
    121 
    122     RBBIMonkeyImpl    *fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
    123     icu::UVector       fBreakRules;        // Contents are of type (BreakRule *).
    124 
    125     LocalUHashtablePointer fCharClasses;   // Key is set name (UnicodeString).
    126                                            // Value is (CharClass *)
    127     LocalPointer<UVector>  fCharClassList; // Char Classes, same contents as fCharClasses values,
    128                                            //   but in a vector so they can be accessed by index.
    129     UnicodeSet         fDictionarySet;     // Dictionary set, empty if none is defined.
    130     Locale             fLocale;
    131     UBreakIteratorType fType;
    132 
    133     CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
    134     void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
    135     bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
    136     RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
    137 
    138     LocalPointer<RegexMatcher> fSetRefsMatcher;
    139     LocalPointer<RegexMatcher> fCommentsMatcher;
    140     LocalPointer<RegexMatcher> fClassDefMatcher;
    141     LocalPointer<RegexMatcher> fRuleDefMatcher;
    142 };
    143 
    144 
    145 // class MonkeyTestData    represents a randomly synthesized test data string together
    146 //                         with the expected break positions obtained by applying
    147 //                         the test break rules.
    148 
    149 class MonkeyTestData: public UObject {
    150   public:
    151     MonkeyTestData() {};
    152     ~MonkeyTestData() {};
    153     void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
    154     void clearActualBreaks();
    155     void dump(int32_t around = -1) const;
    156 
    157     uint32_t               fRandomSeed;        // The initial seed value from the random number genererator.
    158     const BreakRules      *fBkRules;           // The break rules used to generate this data.
    159     UnicodeString          fString;            // The text.
    160     UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
    161                                                //     Parallel to fString. Non-zero if break preceding.
    162     UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
    163     UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
    164                                                // Also parallel to fString.
    165     UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
    166                                                //   didn't cause a break, and a subsequent rule match starts
    167                                                //   on the last code point of the preceding match.
    168 
    169 };
    170 
    171 
    172 
    173 
    174 // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
    175 //                          test for one set of break rules.
    176 //
    177 //                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
    178 //                          between instances of RBBIMonkeyImpl and threads.
    179 //
    180 class RBBIMonkeyImpl: public UObject {
    181   public:
    182     RBBIMonkeyImpl(UErrorCode &status);
    183     ~RBBIMonkeyImpl();
    184 
    185     void setup(const char *ruleFileName, UErrorCode &status);
    186 
    187     void startTest();
    188     void runTest();
    189     void join();
    190 
    191     LocalUCHARBUFPointer                 fRuleCharBuffer;         // source file contents of the reference rules.
    192     LocalPointer<BreakRules>             fRuleSet;
    193     LocalPointer<RuleBasedBreakIterator> fBI;
    194     LocalPointer<MonkeyTestData>         fTestData;
    195     IntlTest::icu_rand                   fRandomGenerator;
    196     const char                          *fRuleFileName;
    197     UBool                                fVerbose;                 // True to do long dump of failing data.
    198     int32_t                              fLoopCount;
    199 
    200     UBool                                fDumpExpansions;          // Debug flag to output epananded form of rules and sets.
    201 
    202     enum CheckDirection {
    203         FORWARD = 1,
    204         REVERSE = 2
    205     };
    206     void clearActualBreaks();
    207     void testForwards(UErrorCode &status);
    208     void testPrevious(UErrorCode &status);
    209     void testFollowing(UErrorCode &status);
    210     void testPreceding(UErrorCode &status);
    211     void testIsBoundary(UErrorCode &status);
    212     void testIsBoundaryRandom(UErrorCode &status);
    213     void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
    214 
    215     class RBBIMonkeyThread: public SimpleThread {
    216       private:
    217         RBBIMonkeyImpl *fMonkeyImpl;
    218       public:
    219         RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
    220         void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
    221     };
    222   private:
    223     void openBreakRules(const char *fileName, UErrorCode &status);
    224     RBBIMonkeyThread fThread;
    225 
    226 };
    227 
    228 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
    229 
    230 #endif  //  RBBIMONKEYTEST_H
    231