Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 * Copyright (C) 1999-2011, International Business Machines Corporation
      4 * and others. All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   11/17/99    aliu        Creation.
      8 **********************************************************************
      9 */
     10 #ifndef RBT_PARS_H
     11 #define RBT_PARS_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 #ifdef __cplusplus
     17 
     18 #include "unicode/uobject.h"
     19 #include "unicode/parseerr.h"
     20 #include "unicode/unorm.h"
     21 #include "rbt.h"
     22 #include "hash.h"
     23 #include "uvector.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 class TransliterationRuleData;
     28 class UnicodeFunctor;
     29 class ParseData;
     30 class RuleHalf;
     31 class ParsePosition;
     32 class StringMatcher;
     33 
     34 class TransliteratorParser : public UMemory {
     35     // Rule parser: call parse(), then query the public data members for the results.
     36  public:
     37 
     38     /**
     39      * PUBLIC data member.  A Vector of TransliterationRuleData objects, one for
     40      * each discrete group of rules in the rule set.
     41      */
     42     UVector dataVector;
     43 
     44     /**
     45      * PUBLIC data member.
     46      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
     47      */
     48     UVector idBlockVector;
     49 
     50     /**
     51      * PUBLIC data member containing the parsed compound filter, if any.
     52      */
     53     UnicodeSet* compoundFilter;
     54 
     55  private:
     56 
     57     /**
     58      * The current data object for which we are parsing rules.
     59      */
     60     TransliterationRuleData* curData;
     61 
     62     UTransDirection direction;
     63 
     64     /**
     65      * Parse error information.
     66      */
     67     UParseError parseError;
     68 
     69     /**
     70      * Temporary symbol table used during parsing.
     71      */
     72     ParseData* parseData;
     73 
     74     /**
     75      * Temporary vector of matcher variables.  When parsing is complete, this
     76      * is copied into the array data.variables.  As with data.variables,
     77      * element 0 corresponds to character data.variablesBase.
     78      */
     79     UVector variablesVector;
     80 
     81     /**
     82      * Temporary table of variable names.  When parsing is complete, this is
     83      * copied into data.variableNames.
     84      */
     85     Hashtable variableNames;
     86 
     87     /**
     88      * String of standins for segments.  Used during the parsing of a single
     89      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     90      * to StringMatcher object segmentObjects.elementAt(0), etc.
     91      */
     92     UnicodeString segmentStandins;
     93 
     94     /**
     95      * Vector of StringMatcher objects for segments.  Used during the
     96      * parsing of a single rule.
     97      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     98      * to StringMatcher object segmentObjects.elementAt(0), etc.
     99      */
    100     UVector segmentObjects;
    101 
    102     /**
    103      * The next available stand-in for variables.  This starts at some point in
    104      * the private use area (discovered dynamically) and increments up toward
    105      * <code>variableLimit</code>.  At any point during parsing, available
    106      * variables are <code>variableNext..variableLimit-1</code>.
    107      */
    108     UChar variableNext;
    109 
    110     /**
    111      * The limit (exclusive) of the stand-ins available for variables.  This is
    112      * discovered dynamically.  At any point during parsing, available variables are
    113      * <code>variableNext..variableLimit-1</code>.
    114      */
    115     UChar variableLimit;
    116 
    117     /**
    118      * When we encounter an undefined variable, we do not immediately signal
    119      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
    120      * Instead, we save the name of the undefined variable, and substitute
    121      * in the placeholder char variableLimit - 1, and decrement
    122      * variableLimit.
    123      */
    124     UnicodeString undefinedVariableName;
    125 
    126     /**
    127      * The stand-in character for the 'dot' set, represented by '.' in
    128      * patterns.  This is allocated the first time it is needed, and
    129      * reused thereafter.
    130      */
    131     UChar dotStandIn;
    132 
    133 public:
    134 
    135     /**
    136      * Constructor.
    137      */
    138     TransliteratorParser(UErrorCode &statusReturn);
    139 
    140     /**
    141      * Destructor.
    142      */
    143     ~TransliteratorParser();
    144 
    145     /**
    146      * Parse the given string as a sequence of rules, separated by newline
    147      * characters ('\n'), and cause this object to implement those rules.  Any
    148      * previous rules are discarded.  Typically this method is called exactly
    149      * once after construction.
    150      *
    151      * Parse the given rules, in the given direction.  After this call
    152      * returns, query the public data members for results.  The caller
    153      * owns the 'data' and 'compoundFilter' data members after this call returns.
    154      * NOTE(review): no member named 'data' is declared here; presumably dataVector's contents -- confirm.
    155      * @param rules      rules, separated by ';' (NOTE(review): text above says newline-separated -- confirm)
    156      * @param direction  either FORWARD or REVERSE.
    157      * @param pe         Struct to receive information on position
    158      *                   of error if an error is encountered
    159      * @param ec         Output param set to success/failure code.
    160      */
    161     void parse(const UnicodeString& rules,
    162                UTransDirection direction,
    163                UParseError& pe,
    164                UErrorCode& ec);
    165 
    166     /**
    167      * Return the compound filter parsed by parse().  Caller owns result.
    168      * @return the compound filter parsed by parse().
    169      */
    170     UnicodeSet* orphanCompoundFilter();
    171 
    172 private:
    173 
    174     /**
    175      * Parse the given rules in the given direction.  Returns nothing.
    176      * @param rules      the rules to parse (input; passed by const reference).
    177      * @param direction  either FORWARD or REVERSE.
    178      */
    179     void parseRules(const UnicodeString& rules,
    180                     UTransDirection direction,
    181                     UErrorCode& status);
    182 
    183     /**
    184      * MAIN PARSER.  Parse the next rule in the given rule string, starting
    185      * at pos.  Return the index after the last character parsed.  Do not
    186      * parse characters at or after limit.
    187      *
    188      * Important:  The character at pos must be a non-whitespace character
    189      * that is not the comment character.
    190      *
    191      * This method handles quoting, escaping, and whitespace removal.  It
    192      * parses the end-of-rule character.  It recognizes context and cursor
    193      * indicators.  Once it does a lexical breakdown of the rule at pos, it
    194      * creates a rule object and adds it to our rule list.
    195      * @param rule       the rule string to parse (input; passed by const reference).
    196      * @param pos        the starting position.
    197      * @param limit      index past the last character of the rule.
    198      * @return           the index after the last character parsed.
    199      */
    200     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
    201 
    202     /**
    203      * Set the variable range to [start, end] (inclusive).
    204      * @param start    the start value of the range.
    205      * @param end      the end value of the range.
    206      */
    207     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
    208 
    209     /**
    210      * Assert that the given character is NOT within the variable range.
    211      * If it is, return FALSE.  This is necessary to ensure that the
    212      * variable range does not overlap characters used in a rule.
    213      * @param ch     the given character.
    214      * @return       True, if the given character is NOT within the variable range.
    215      */
    216     UBool checkVariableRange(UChar32 ch) const;
    217 
    218     /**
    219      * Set the maximum backup to 'backup', in response to a pragma
    220      * statement.
    221      * @param backup    the new value to be set.
    222      */
    223     void pragmaMaximumBackup(int32_t backup);
    224 
    225     /**
    226      * Begin normalizing all rules using the given mode, in response
    227      * to a pragma statement.
    228      * @param mode    the given mode.
    229      */
    230     void pragmaNormalizeRules(UNormalizationMode mode);
    231 
    232     /**
    233      * Return true if the given rule looks like a pragma.
    234      * @param rule the rule string to examine.
    235      * @param pos offset to the first non-whitespace character of the rule.
    236      * @param limit index past the last character of the rule.
    237      * @return true if the given rule looks like a pragma.
    238      */
    239     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
    240 
    241     /**
    242      * Parse a pragma.  This method assumes resemblesPragma() has
    243      * already returned true.
    244      * @param rule the rule string containing the pragma.
    245      * @param pos offset to the first non-whitespace character of the rule.
    246      * @param limit index past the last character of the rule.
    247      * @return the position index after the final ';' of the pragma,
    248      * or -1 on failure.
    249      */
    250     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
    251 
    252     /**
    253      * Called by main parser upon syntax error.  Search the rule string
    254      * for the probable end of the rule.  Of course, if the error is that
    255      * the end of rule marker is missing, then the rule end will not be found.
    256      * In any case the rule start will be correctly reported.
    257      * @param parseErrorCode error code.
    258      * @param msg error description (this parameter is unnamed in the declaration).
    259      * @param start position of first character of current rule.
    260      * @return start position of first character of current rule.
    261      */
    262     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
    263                         UErrorCode& status);
    264 
    265     /**
    266      * Parse a UnicodeSet out, store it, and return the stand-in character
    267      * used to represent it.
    268      *
    269      * @param rule    the rule for UnicodeSet.
    270      * @param pos     the position in pattern at which to start parsing.
    271      * @return        the stand-in character used to represent it.
    272      */
    273     UChar parseSet(const UnicodeString& rule,
    274                    ParsePosition& pos,
    275                    UErrorCode& status);
    276 
    277     /**
    278      * Generate and return a stand-in for a new UnicodeFunctor.  Store
    279      * the matcher (adopt it).
    280      * @param adopted the UnicodeFunctor to be adopted.
    281      * @return        a stand-in for a new UnicodeFunctor.
    282      */
    283     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
    284 
    285     /**
    286      * Return the standin for segment seg (1-based).
    287      * @param seg    the given segment.
    288      * @return       the standIn character for the given segment.
    289      */
    290     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
    291 
    292     /**
    293      * Set the object for segment seg (1-based).
    294      * @param seg      the given segment.
    295      * @param adopted  the StringMatcher to be adopted.
    296      */
    297     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
    298 
    299     /**
    300      * Return the stand-in for the dot set.  It is allocated the first
    301      * time and reused thereafter.
    302      * @return    the stand-in for the dot set.
    303      */
    304     UChar getDotStandIn(UErrorCode& status);
    305 
    306     /**
    307      * Append the value of the given variable name to the given
    308      * UnicodeString.
    309      * @param name    the variable name to be appended.
    310      * @param buf     the given UnicodeString to append to.
    311      */
    312     void appendVariableDef(const UnicodeString& name,
    313                            UnicodeString& buf,
    314                            UErrorCode& status);
    315 
    316     /**
    317      * Glue method to get around access restrictions in C++.
    318      */
    319     /*static Transliterator* createBasicInstance(const UnicodeString& id,
    320                                                const UnicodeString* canonID);*/
    321 
    322     friend class RuleHalf;
    323 
    324     // Disallowed methods; no impl.
    325     /**
    326      * Copy constructor (private and not implemented, so copying is disallowed).
    327      */
    328     TransliteratorParser(const TransliteratorParser&);
    329 
    330     /**
    331      * Assignment operator (private and not implemented, so assignment is disallowed).
    332      */
    333     TransliteratorParser& operator=(const TransliteratorParser&);
    334 };
    335 
    336 U_NAMESPACE_END
    337 
    338 #endif /* #ifdef __cplusplus */
    339 
    340 /**
    341  * Strip/convert the following from the transliterator rules:
    342  * comments
    343  * newlines
    344  * white space at the beginning and end of a line
    345  * unescape \u notation
    346  *
    347  * The target must be equal in size to the source.
    348  * @internal
    349  */
    350 U_CAPI int32_t
    351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
    352 
    353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    354 
    355 #endif
    356