Home | History | Annotate | Download | only in i18n
      1 /*
      2  * Copyright (C) 2001-2011, International Business Machines Corporation
      3  * and others. All Rights Reserved.
      4  **********************************************************************
      5  *   Date        Name        Description
      6  *   07/23/01    aliu        Creation.
      7  **********************************************************************
      8  */
      9 #ifndef STRMATCH_H
     10 #define STRMATCH_H
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_TRANSLITERATION
     15 
     16 #include "unicode/unistr.h"
     17 #include "unicode/unifunct.h"
     18 #include "unicode/unimatch.h"
     19 #include "unicode/unirepl.h"
     20 
     21 U_NAMESPACE_BEGIN
     22 
     23 class TransliterationRuleData;
     24 
     25 /**
     26  * An object that matches a fixed input string, implementing the
     27  * UnicodeMatcher API.  This object also implements the
     28  * UnicodeReplacer API, allowing it to emit the matched text as
     29  * output.  Since the match text may contain flexible match elements,
     30  * such as UnicodeSets, the emitted text is not the match pattern, but
     31  * instead a substring of the actual matched text.  Following
     32  * convention, the output text is the leftmost match seen up to this
     33  * point.
         * NOTE(review): the matchStart/matchLimit members are documented as
         * recording the <em>rightmost</em> match, which disagrees with the
         * "leftmost" wording above -- confirm against the implementation.
     34  *
     35  * A StringMatcher may represent a segment, in which case it has a
     36  * positive segment number.  This affects how the matcher converts
     37  * itself to a pattern but does not otherwise affect its function.
     38  *
     39  * A StringMatcher that is not a segment should not be used as a
     40  * UnicodeReplacer.
     41  */
     42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
     43 
     44  public:
     45 
     46     /**
     47      * Construct a matcher that matches the given pattern string.
     48      * @param string the pattern to be matched, possibly containing
     49      * stand-ins that represent nested UnicodeMatcher objects.
     50      * @param start inclusive start index of text to be replaced
     51      * @param limit exclusive end index of text to be replaced;
     52      * must be greater than or equal to start
             * NOTE(review): the start/limit wording is identical to that of
             * replace(); here they presumably delimit the part of 'string'
             * that forms the pattern -- confirm against the implementation.
     53      * @param segmentNum the segment number from 1..n, or 0 if this is
     54      * not a segment.
     55      * @param data context object mapping stand-ins to
     56      * UnicodeMatcher objects.
     57      */
     58     StringMatcher(const UnicodeString& string,
     59                   int32_t start,
     60                   int32_t limit,
     61                   int32_t segmentNum,
     62                   const TransliterationRuleData& data);
     63 
     64     /**
     65      * Copy constructor
     66      * @param o  the object to be copied.
     67      */
     68     StringMatcher(const StringMatcher& o);
     69 
     70     /**
     71      * Destructor
     72      */
     73     virtual ~StringMatcher();
     74 
     75     /**
     76      * Implement UnicodeFunctor
     77      * @return a copy of the object.
     78      */
     79     virtual UnicodeFunctor* clone() const;
     80 
     81     /**
     82      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     83      * and return the pointer.
     84      * @return the UnicodeMatcher pointer.
     85      */
     86     virtual UnicodeMatcher* toMatcher() const;
     87 
     88     /**
     89      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     90      * and return the pointer.
     91      * @return the UnicodeReplacer pointer.
     92      */
     93     virtual UnicodeReplacer* toReplacer() const;
     94 
     95     /**
     96      * Implement UnicodeMatcher
     97      * @param text the text to be matched
     98      * @param offset on input, the index into text at which to begin
     99      * matching.  On output, the limit of the matched text.  The
    100      * number of matched characters is the output value of offset
    101      * minus the input value.  Offset should always point to the
    102      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
    103      * both on entry and upon return.
    104      * @param limit the limit index of text to be matched.  Greater
    105      * than offset for a forward direction match, less than offset for
    106      * a backward direction match.  The last character to be
    107      * considered for matching will be text.charAt(limit-1) in the
    108      * forward direction or text.charAt(limit+1) in the backward
    109      * direction.
    110      * @param incremental  if TRUE, then assume further characters may
    111      * be inserted at limit and check for partial matching.  Otherwise
    112      * assume the text as given is complete.
    113      * @return a match degree value indicating a full match, a partial
    114      * match, or a mismatch.  If incremental is FALSE then
    115      * U_PARTIAL_MATCH should never be returned.
    116      */
    117     virtual UMatchDegree matches(const Replaceable& text,
    118                                  int32_t& offset,
    119                                  int32_t limit,
    120                                  UBool incremental);
    121 
    122     /**
    123      * Implement UnicodeMatcher
    124      * @param result            Output param to receive the pattern.
    125      * @param escapeUnprintable if TRUE then escape the unprintable characters.
    126      * @return                  A reference to 'result'.
    127      */
    128     virtual UnicodeString& toPattern(UnicodeString& result,
    129                                      UBool escapeUnprintable = FALSE) const;
    130 
    131     /**
    132      * Implement UnicodeMatcher
    133      * Returns TRUE if this matcher will match a character c, where c
    134      * & 0xFF == v, at offset, in the forward direction (with limit >
    135      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
    136      * indexing.
    137      * @param v    the given value
    138      * @return     TRUE if this matcher will match a character c,
    139      *             where c & 0xFF == v
    140      */
    141     virtual UBool matchesIndexValue(uint8_t v) const;
    142 
    143     /**
    144      * Implement UnicodeMatcher
             * @param toUnionTo the set to be added to.
             * NOTE(review): presumably the matching-side counterpart of
             * addReplacementSetTo(), i.e. it unions the characters this
             * object may match into the set -- confirm.
    145      */
    146     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
    147 
    148     /**
    149      * Implement UnicodeFunctor
             * NOTE(review): the parameter is unnamed here; presumably it
             * supplies a new context object for the 'data' member -- confirm.
    150      */
    151     virtual void setData(const TransliterationRuleData*);
    152 
    153     /**
    154      * Replace characters in 'text' from 'start' to 'limit' with the
    155      * output text of this object.  Update the 'cursor' parameter to
    156      * give the cursor position and return the length of the
    157      * replacement text.
    158      *
    159      * @param text the text in which characters are to be replaced
    160      * @param start inclusive start index of text to be replaced
    161      * @param limit exclusive end index of text to be replaced;
    162      * must be greater than or equal to start
    163      * @param cursor output parameter for the cursor position.
    164      * Not all replacer objects will update this, but in a complete
    165      * tree of replacer objects, representing the entire output side
    166      * of a transliteration rule, at least one must update it.
    167      * @return the number of 16-bit code units in the text replacing
    168      * the characters at offsets start..(limit-1) in text
    169      */
    170     virtual int32_t replace(Replaceable& text,
    171                             int32_t start,
    172                             int32_t limit,
    173                             int32_t& cursor);
    174 
    175     /**
    176      * Returns a string representation of this replacer.  If the
    177      * result of calling this function is passed to the appropriate
    178      * parser, typically TransliteratorParser, it will produce another
    179      * replacer that is equal to this one.
    180      * @param result the string to receive the pattern.  Previous
    181      * contents will be deleted.
    182      * @param escapeUnprintable if TRUE then convert unprintable
    183      * characters to their hex escape representations, \\uxxxx or
    184      * \\Uxxxxxxxx.  Unprintable characters are defined by
    185      * Utility.isUnprintable().
    186      * @return a reference to 'result'.
    187      */
    188     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
    189                                              UBool escapeUnprintable) const;
    190 
    191     /**
    192      * Remove any match data.  This must be called before performing a
    193      * set of matches with this segment.
    194      */
    195     void resetMatch();
    196 
    197     /**
    198      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    199      */
    200     virtual UClassID getDynamicClassID() const;
    201 
    202     /**
    203      * ICU "poor man's RTTI", returns a UClassID for this class.
    204      */
    205     static UClassID U_EXPORT2 getStaticClassID();
    206 
    207     /**
    208      * Union the set of all characters that may be output by this object
    209      * into the given set.
    210      * @param toUnionTo the set into which to union the output characters
    211      */
    212     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
    213 
    214  private:
    215 
    216     /**
    217      * The text to be matched.
    218      */
    219     UnicodeString pattern;
    220 
    221     /**
    222      * Context object that maps stand-ins to matcher and replacer
    223      * objects.
    224      */
    225     const TransliterationRuleData* data;
    226 
    227     /**
    228      * The segment number, 1-based, or 0 if not a segment.
    229      */
    230     int32_t segmentNumber;
    231 
    232     /**
    233      * Start offset, in the match text, of the <em>rightmost</em>
    234      * match.
    235      */
    236     int32_t matchStart;
    237 
    238     /**
    239      * Limit offset, in the match text, of the <em>rightmost</em>
    240      * match.
    241      */
    242     int32_t matchLimit;
    243 
    244 };
    245 
    246 U_NAMESPACE_END
    247 
    248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    249 
    250 #endif
    251