Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  * Copyright (C) 2001-2011, International Business Machines Corporation
      5  * and others. All Rights Reserved.
      6  **********************************************************************
      7  *   Date        Name        Description
      8  *   07/23/01    aliu        Creation.
      9  **********************************************************************
     10  */
     11 #ifndef STRMATCH_H
     12 #define STRMATCH_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_TRANSLITERATION
     17 
     18 #include "unicode/unistr.h"
     19 #include "unicode/unifunct.h"
     20 #include "unicode/unimatch.h"
     21 #include "unicode/unirepl.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 class TransliterationRuleData;  // Forward declaration: only used by pointer/reference in this header.
     26 
     27 /**
     28  * An object that matches a fixed input string, implementing the
     29  * UnicodeMatcher API.  This object also implements the
     30  * UnicodeReplacer API, allowing it to emit the matched text as
     31  * output.  Since the match text may contain flexible match elements,
     32  * such as UnicodeSets, the emitted text is not the match pattern, but
     33  * instead a substring of the actual matched text.  Following
     34  * convention, the output text is the leftmost match seen up to this
     35  * point.
         * NOTE(review): the matchStart/matchLimit members below are
         * documented as tracking the <em>rightmost</em> match; confirm
         * against the implementation which description is accurate.
     36  *
     37  * A StringMatcher may represent a segment, in which case it has a
     38  * positive segment number.  This affects how the matcher converts
     39  * itself to a pattern but does not otherwise affect its function.
     40  *
     41  * A StringMatcher that is not a segment should not be used as a
     42  * UnicodeReplacer.
     43  */
     44 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
     45 
     46  public:
     47 
     48     /**
     49      * Construct a matcher that matches the given pattern string.
     50      * @param string the pattern to be matched, possibly containing
     51      * stand-ins that represent nested UnicodeMatcher objects.
     52      * @param start inclusive start index of text to be replaced
     53      * @param limit exclusive end index of text to be replaced;
     54      * must be greater than or equal to start
     55      * @param segmentNum the segment number from 1..n, or 0 if this is
     56      * not a segment.
     57      * @param data context object mapping stand-ins to
     58      * UnicodeMatcher objects.
             *
             * NOTE(review): start/limit presumably index into 'string' and
             * select the pattern text; the "to be replaced" wording is the
             * same as in the replace() documentation below -- confirm
             * against the implementation.
     59      */
     60     StringMatcher(const UnicodeString& string,
     61                   int32_t start,
     62                   int32_t limit,
     63                   int32_t segmentNum,
     64                   const TransliterationRuleData& data);
     65 
     66     /**
     67      * Copy constructor
     68      * @param o  the object to be copied.
     69      */
     70     StringMatcher(const StringMatcher& o);
     71 
     72     /**
     73      * Destructor
     74      */
     75     virtual ~StringMatcher();
     76 
     77     /**
     78      * Implement UnicodeFunctor
     79      * @return a copy of the object.
     80      */
     81     virtual UnicodeFunctor* clone() const;
     82 
     83     /**
     84      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     85      * and return the pointer.
     86      * @return the UnicodeMatcher pointer.
     87      */
     88     virtual UnicodeMatcher* toMatcher() const;
     89 
     90     /**
     91      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     92      * and return the pointer.
     93      * @return the UnicodeReplacer pointer.
     94      */
     95     virtual UnicodeReplacer* toReplacer() const;
     96 
     97     /**
     98      * Implement UnicodeMatcher
     99      * @param text the text to be matched
    100      * @param offset on input, the index into text at which to begin
    101      * matching.  On output, the limit of the matched text.  The
    102      * number of matched characters is the output value of offset
    103      * minus the input value.  Offset should always point to the
    104      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
    105      * both on entry and upon return.
    106      * @param limit the limit index of text to be matched.  Greater
    107      * than offset for a forward direction match, less than offset for
    108      * a backward direction match.  The last character to be
    109      * considered for matching will be text.charAt(limit-1) in the
    110      * forward direction or text.charAt(limit+1) in the backward
    111      * direction.
    112      * @param incremental  if TRUE, then assume further characters may
    113      * be inserted at limit and check for partial matching.  Otherwise
    114      * assume the text as given is complete.
    115      * @return a match degree value indicating a full match, a partial
    116      * match, or a mismatch.  If incremental is FALSE then
    117      * U_PARTIAL_MATCH should never be returned.
    118      */
    119     virtual UMatchDegree matches(const Replaceable& text,
    120                                  int32_t& offset,
    121                                  int32_t limit,
    122                                  UBool incremental);
    123 
    124     /**
    125      * Implement UnicodeMatcher
    126      * @param result            Output param to receive the pattern.
    127      * @param escapeUnprintable if TRUE then escape the unprintable characters.
    128      * @return                  A reference to 'result'.
    129      */
    130     virtual UnicodeString& toPattern(UnicodeString& result,
    131                                      UBool escapeUnprintable = FALSE) const;
    132 
    133     /**
    134      * Implement UnicodeMatcher
    135      * Returns TRUE if this matcher will match a character c, where c
    136      * & 0xFF == v, at offset, in the forward direction (with limit >
    137      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
    138      * indexing.
    139      * @param v    the given value
    140      * @return     TRUE if this matcher will match a character c,
    141      *             where c & 0xFF == v
    142      */
    143     virtual UBool matchesIndexValue(uint8_t v) const;
    144 
    145     /**
    146      * Implement UnicodeMatcher
             * @param toUnionTo the set to be updated.
             * NOTE(review): presumably unions into toUnionTo the set of all
             * characters this matcher may match, by analogy with
             * addReplacementSetTo() -- confirm against the implementation.
    147      */
    148     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
    149 
    150     /**
    151      * Implement UnicodeFunctor
             * @param (unnamed) a TransliterationRuleData context pointer.
             * NOTE(review): presumably replaces the 'data' member of this
             * object -- confirm against the implementation.
    152      */
    153     virtual void setData(const TransliterationRuleData*);
    154 
    155     /**
    156      * Replace characters in 'text' from 'start' to 'limit' with the
    157      * output text of this object.  Update the 'cursor' parameter to
    158      * give the cursor position and return the length of the
    159      * replacement text.
    160      *
    161      * @param text the text in which characters are replaced
    162      * @param start inclusive start index of text to be replaced
    163      * @param limit exclusive end index of text to be replaced;
    164      * must be greater than or equal to start
    165      * @param cursor output parameter for the cursor position.
    166      * Not all replacer objects will update this, but in a complete
    167      * tree of replacer objects, representing the entire output side
    168      * of a transliteration rule, at least one must update it.
    169      * @return the number of 16-bit code units in the text replacing
    170      * the characters at offsets start..(limit-1) in text
    171      */
    172     virtual int32_t replace(Replaceable& text,
    173                             int32_t start,
    174                             int32_t limit,
    175                             int32_t& cursor);
    176 
    177     /**
    178      * Returns a string representation of this replacer.  If the
    179      * result of calling this function is passed to the appropriate
    180      * parser, typically TransliteratorParser, it will produce another
    181      * replacer that is equal to this one.
    182      * @param result the string to receive the pattern.  Previous
    183      * contents will be deleted.
    184      * @param escapeUnprintable if TRUE then convert unprintable
    185      * characters to their hex escape representations, \\uxxxx or
    186      * \\Uxxxxxxxx.  Unprintable characters are defined by
    187      * Utility.isUnprintable().
    188      * @return a reference to 'result'.
    189      */
    190     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
    191                                              UBool escapeUnprintable) const;
    192 
    193     /**
    194      * Remove any match data.  This must be called before performing a
    195      * set of matches with this segment.
    196      */
    197     void resetMatch();
    198 
    199     /**
    200      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    201      */
    202     virtual UClassID getDynamicClassID() const;
    203 
    204     /**
    205      * ICU "poor man's RTTI", returns a UClassID for this class.
    206      */
    207     static UClassID U_EXPORT2 getStaticClassID();
    208 
    209     /**
    210      * Union the set of all characters that may be output by this object
    211      * into the given set.
    212      * @param toUnionTo the set into which to union the output characters
    213      */
    214     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
    215 
    216  private:
    217 
    218     /**
    219      * The text to be matched.
    220      */
    221     UnicodeString pattern;
    222 
    223     /**
    224      * Context object that maps stand-ins to matcher and replacer
    225      * objects.
    226      */
    227     const TransliterationRuleData* data;
    228 
    229     /**
    230      * The segment number, 1-based, or 0 if not a segment.
    231      */
    232     int32_t segmentNumber;
    233 
    234     /**
    235      * Start offset, in the match text, of the <em>rightmost</em>
    236      * match.
    237      */
    238     int32_t matchStart;
    239 
    240     /**
    241      * Limit offset, in the match text, of the <em>rightmost</em>
    242      * match.
    243      */
    244     int32_t matchLimit;
    245 
    246 };
    247 
    248 U_NAMESPACE_END
    249 
    250 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    251 
    252 #endif
    253