Home | History | Annotate | Download | only in i18n
      1 /*
      2 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
      3 **********************************************************************
      4 *   Date        Name        Description
      5 *   11/17/99    aliu        Creation.
      6 **********************************************************************
      7 */
      8 #ifndef RBT_RULE_H
      9 #define RBT_RULE_H
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uobject.h"
     16 #include "unicode/unistr.h"
     17 #include "unicode/utrans.h"
     18 #include "unicode/unimatch.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 class Replaceable;
     23 class TransliterationRuleData;
     24 class StringMatcher;
     25 class UnicodeFunctor;
     26 
     27 /**
     28  * A transliteration rule used by
     29  * <code>RuleBasedTransliterator</code>.
     30  * <code>TransliterationRule</code> is an immutable object.
     31  *
     32  * <p>A rule consists of an input pattern and an output string.  When
     33  * the input pattern is matched, the output string is emitted.  The
     34  * input pattern consists of zero or more characters which are matched
     35  * exactly (the key) and optional context.  Context must match if it
     36  * is specified.  Context may be specified before the key, after the
     37  * key, or both.  The key, preceding context, and following context
     38  * may contain variables.  Variables represent a set of Unicode
     39  * characters, such as the letters <i>a</i> through <i>z</i>.
     40  * Variables are detected by looking up each character in a supplied
     41  * variable list to see if it has been so defined.
     42  *
     43  * <p>A rule may contain segments in its input string and segment
     44  * references in its output string.  A segment is a substring of the
     45  * input pattern, indicated by an offset and limit.  The segment may
     46  * be in the preceding or following context.  It may not span a
     47  * context boundary.  A segment reference is a special character in
     48  * the output string that causes a segment of the input string (not
     49  * the input pattern) to be copied to the output string.  The range of
     50  * special characters that represent segment references is defined by
     51  * RuleBasedTransliterator.Data.
     52  *
     53  * @author Alan Liu
     54  */
     55 class TransliterationRule : public UMemory {
     56 
     57 private:
     58 
     59     // TODO Eliminate the pattern and keyLength data members.  They
     60     // are used only by masks() and getIndexValue() which are called
     61     // only during build time, not during run-time.  Perhaps these
     62     // methods and pattern/keyLength can be isolated into a separate
     63     // object.
     64 
     65     /**
     66      * The match that must occur before the key, or null if there is no
     67      * preceding context.
     68      */
     69     StringMatcher *anteContext;
     70 
     71     /**
     72      * The matcher object for the key.  If null, then the key is empty.
     73      */
     74     StringMatcher *key;
     75 
     76     /**
     77      * The match that must occur after the key, or null if there is no
     78      * following context.
     79      */
     80     StringMatcher *postContext;
     81 
     82     /**
     83      * The object that performs the replacement if the key,
     84      * anteContext, and postContext are matched.  Never null.
     85      */
     86     UnicodeFunctor* output;
     87 
     88     /**
     89      * The string that must be matched, consisting of the anteContext, key,
     90      * and postContext, concatenated together, in that order.  Some components
     91      * may be empty (zero length).
     92      * @see anteContextLength
     93      * @see keyLength
     94      */
     95     UnicodeString pattern;
     96 
     97     /**
     98      * An array of matcher objects corresponding to the input pattern
     99      * segments.  If there are no segments this is null.  N.B. The
    100      * elements are typed as UnicodeFunctor for generality, but in
    101      * practice each one is always a StringMatcher.  In the future we
    102      * may generalize this, but for now we sometimes cast down to StringMatcher.
    103      *
    104      * The array is owned, but the pointers within it are not.
    105      */
    106     UnicodeFunctor** segments;
    107 
    108     /**
    109      * The number of elements in segments[] or zero if segments is NULL.
    110      */
    111     int32_t segmentsCount;
    112 
    113     /**
    114      * The length of the string that must match before the key.  If
    115      * zero, then there is no matching requirement before the key.
    116      * Substring [0,anteContextLength) of pattern is the anteContext.
    117      */
    118     int32_t anteContextLength;
    119 
    120     /**
    121      * The length of the key.  Substring [anteContextLength,
    122      * anteContextLength + keyLength) of pattern is the key.
    123      * @see pattern
    124      */
    125     int32_t keyLength;
    126 
    127     /**
    128      * Miscellaneous attributes.
             * NOTE(review): presumably a bitwise OR of the flag attribute
             * values (ANCHOR_START, ANCHOR_END) declared below -- confirm
             * against the implementation.
    129      */
    130     int8_t flags;
    131 
    132     /**
    133      * Flag attributes.  The values are distinct powers of two.
             * NOTE(review): these presumably record the anchorStart and
             * anchorEnd constructor arguments in <code>flags</code> --
             * confirm against the implementation.
    134      */
    135     enum {
    136         ANCHOR_START = 1,
    137         ANCHOR_END   = 2
    138     };
    139 
    140     /**
    141      * An alias pointer to the data for this rule.  The data provides
    142      * lookup services for matchers and segments.
    143      */
    144     const TransliterationRuleData* data;
    145 
    146 public:
    147 
    148     /**
    149      * Construct a new rule with the given input, output text, and other
    150      * attributes.  A cursor position may be specified for the output text.
    151      * @param input          input string, including key and optional ante and
    152      *                       post context.
    153      * @param anteContextPos offset into input to end of ante context, or -1 if
    154      *                       none.  Must be <= input.length() if not -1.
    155      * @param postContextPos offset into input to start of post context, or -1
    156      *                       if none.  Must be <= input.length() if not -1, and must be >=
    157      *                       anteContextPos.
    158      * @param outputStr      output string.
    159      * @param cursorPosition offset into outputStr at which cursor is located, or -1 if
    160      *                       none.  If less than zero, then the cursor is placed after the
    161      *                       <code>outputStr</code>; that is, -1 is equivalent to
    162      *                       <code>outputStr.length()</code>.  If greater than
    163      *                       <code>outputStr.length()</code> then an exception is thrown.
             *                       NOTE(review): "exception" looks like wording carried over
             *                       from another implementation; in this signature the failure
             *                       is presumably reported through status -- confirm.
    164      * @param cursorOffset   an offset to be added to cursorPosition to position the
    165      *                       cursor either in the ante context, if < 0, or in the post context, if >
    166      *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
    167      *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
    168      *                       of -3.
    169      * @param segs           array of UnicodeFunctor matcher objects corresponding to input
    170      *                       pattern segments, or null if there are none.  The array itself is
    171      *                       adopted, but the pointers within it are not.
    172      * @param segsCount      number of elements in segs[].
    173      * @param anchorStart    TRUE if the rule is anchored on the left to
    174      *                       the context start.
    175      * @param anchorEnd      TRUE if the rule is anchored on the right to the
    176      *                       context limit.
    177      * @param data           the rule data.
    178      * @param status         Output parameter filled in with success or failure status.
    179      */
    180     TransliterationRule(const UnicodeString& input,
    181                         int32_t anteContextPos, int32_t postContextPos,
    182                         const UnicodeString& outputStr,
    183                         int32_t cursorPosition, int32_t cursorOffset,
    184                         UnicodeFunctor** segs,
    185                         int32_t segsCount,
    186                         UBool anchorStart, UBool anchorEnd,
    187                         const TransliterationRuleData* data,
    188                         UErrorCode& status);
    189 
    190     /**
    191      * Copy constructor.
    192      * @param other    the object to be copied.
             *                 NOTE(review): taken by non-const reference, so const
             *                 rules and temporaries cannot be copied.  Whether the
             *                 copy needs to modify other cannot be told from this
             *                 header -- confirm before changing to const&.
    193      */
    194     TransliterationRule(TransliterationRule& other);
    195 
    196     /**
    197      * Destructor.
    198      */
    199     virtual ~TransliterationRule();
    200 
    201     /**
    202      * Change the data object that this rule belongs to.  Used
    203      * internally by the TransliterationRuleData copy constructor.
    204      * @param data    the new data value to be set.
    205      */
    206     void setData(const TransliterationRuleData* data);
    207 
    208     /**
    209      * Return the preceding context length.  This method is needed to
    210      * support the <code>Transliterator</code> method
    211      * <code>getMaximumContextLength()</code>.  Internally, this is
    212      * implemented as the anteContextLength, optionally plus one if
    213      * there is a start anchor.  The one character anchor gap is
    214      * needed to make repeated incremental transliteration with
    215      * anchors work.
    216      * @return    the preceding context length.
    217      */
    218     virtual int32_t getContextLength(void) const;
    219 
    220     /**
    221      * Internal method.  Returns 8-bit index value for this rule.
    222      * This is the low byte of the first character of the key,
    223      * unless the first character of the key is a set.  If it's a
    224      * set, or otherwise can match multiple keys, the index value is -1.
             * The int16_t return type can represent both the 0..255 byte
             * values and the -1 sentinel.
    225      * @return    8-bit index value for this rule, or -1.
    226      */
    227     int16_t getIndexValue() const;
    228 
    229     /**
    230      * Internal method.  Returns true if this rule matches the given
    231      * index value.  The index value is an 8-bit integer, 0..255,
    232      * representing the low byte of the first character of the key.
    233      * It matches this rule if it matches the first character of the
    234      * key, or if the first character of the key is a set, and the set
    235      * contains any character with a low byte equal to the index
    236      * value.  If the rule contains only ante context, as in foo)>bar,
    237      * then it will match any key.
    238      * @param v    the given index value.
    239      * @return     true if this rule matches the given index value.
    240      */
    241     UBool matchesIndexValue(uint8_t v) const;
    242 
    243     /**
    244      * Return true if this rule masks another rule.  If r1 masks r2 then
    245      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
    246      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
    247      * "[c]a>x" masks "[dc]a>y".
    248      * @param r2  the given rule to be compared with.
    249      * @return    true if this rule masks 'r2'
    250      */
    251     virtual UBool masks(const TransliterationRule& r2) const;
    252 
    253     /**
    254      * Attempt a match and replacement at the given position.  Return
    255      * the degree of match between this rule and the given text.  The
    256      * degree of match may be mismatch, a partial match, or a full
    257      * match.  A mismatch means at least one character of the text
    258      * does not match the context or key.  A partial match means some
    259      * context and key characters match, but the text is not long
    260      * enough to match all of them.  A full match means all context
    261      * and key characters match.
    262      *
    263      * If a full match is obtained, perform a replacement, update pos,
    264      * and return U_MATCH.  Otherwise both text and pos are unchanged.
    265      *
    266      * @param text the text
    267      * @param pos the position indices
    268      * @param incremental if TRUE, test for partial matches that may
    269      * be completed by additional text inserted at pos.limit.
    270      * @return one of <code>U_MISMATCH</code>,
    271      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
    272      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
    273      */
    274     UMatchDegree matchAndReplace(Replaceable& text,
    275                                  UTransPosition& pos,
    276                                  UBool incremental) const;
    277 
    278     /**
    279      * Create a rule string that represents this rule object.  Append
    280      * it to the given string.
             * @param pat               the string to which the rule string is
             *                          appended.
             * @param escapeUnprintable presumably selects whether unprintable
             *                          characters are written in escaped form
             *                          -- TODO confirm against the implementation.
             * @return                  a UnicodeString reference, presumably pat
             *                          itself -- TODO confirm.
    281      */
    282     virtual UnicodeString& toRule(UnicodeString& pat,
    283                                   UBool escapeUnprintable) const;
    284 
    285     /**
    286      * Union the set of all characters that may be modified by this rule
    287      * into the given set.
             * @param toUnionTo    the set that receives the union.
    288      */
    289     void addSourceSetTo(UnicodeSet& toUnionTo) const;
    290 
    291     /**
    292      * Union the set of all characters that may be emitted by this rule
    293      * into the given set.
             * @param toUnionTo    the set that receives the union.
    294      */
    295     void addTargetSetTo(UnicodeSet& toUnionTo) const;
    296 
    297  private:
    298 
    299     friend class StringMatcher;
    300 
    301     TransliterationRule &operator=(const TransliterationRule &other); // forbid assignment of this class (copy construction stays public)
    302 };
    303 
    304 U_NAMESPACE_END
    305 
    306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    307 
    308 #endif
    309