Home | History | Annotate | Download | only in unicode
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  regex.h
      9 *   encoding:   UTF-8
     10 *   indentation:4
     11 *
     12 *   created on: 2002oct22
     13 *   created by: Andy Heninger
     14 *
     15 *   ICU Regular Expressions, API for C++
     16 */
     17 
     18 #ifndef REGEX_H
     19 #define REGEX_H
     20 
     21 //#define REGEX_DEBUG
     22 
     23 /**
     24  * \file
     25  * \brief  C++ API:  Regular Expressions
     26  *
     27  * <h2>Regular Expression API</h2>
     28  *
     29  * <p>The ICU API for processing regular expressions consists of two classes,
     30  *  <code>RegexPattern</code> and <code>RegexMatcher</code>.
     31  *  <code>RegexPattern</code> objects represent a pre-processed, or compiled
     32  *  regular expression.  They are created from a regular expression pattern string,
     33  *  and can be used to create <code>RegexMatcher</code> objects for the pattern.</p>
     34  *
     35  * <p>Class <code>RegexMatcher</code> bundles together a regular expression
     36  *  pattern and a target string to which the search pattern will be applied.
     37  *  <code>RegexMatcher</code> includes API for doing plain find or search
     38  *  operations, for search and replace operations, and for obtaining detailed
     39  *  information about bounds of a match. </p>
     40  *
     41  * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
     42  * expression pattern strings application code can be simplified and the explicit
     43  * need for <code>RegexPattern</code> objects can usually be eliminated.
     44  * </p>
     45  */
     46 
     47 #include "unicode/utypes.h"
     48 
     49 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     50 
     51 #include "unicode/uobject.h"
     52 #include "unicode/unistr.h"
     53 #include "unicode/utext.h"
     54 #include "unicode/parseerr.h"
     55 
     56 #include "unicode/uregex.h"
     57 
     58 // Forward Declarations
     59 
     60 struct UHashtable;
     61 
     62 U_NAMESPACE_BEGIN
     63 
     64 struct Regex8BitSet;
     65 class  RegexCImpl;
     66 class  RegexMatcher;
     67 class  RegexPattern;
     68 struct REStackFrame;
     69 class  RuleBasedBreakIterator;
     70 class  UnicodeSet;
     71 class  UVector;
     72 class  UVector32;
     73 class  UVector64;
     74 
     75 
     76 /**
     77   * Class <code>RegexPattern</code> represents a compiled regular expression.  It includes
     78   * factory methods for creating a RegexPattern object from the source (string) form
     79   * of a regular expression, methods for creating RegexMatchers that allow the pattern
     80   * to be applied to input text, and a few convenience methods for simple common
     81   * uses of regular expressions.
     82   *
     83   * <p>Class RegexPattern is not intended to be subclassed.</p>
     84   *
     85   * @stable ICU 2.4
     86   */
     87 class U_I18N_API RegexPattern U_FINAL : public UObject {
     88 public:
     89 
     90     /**
     91      * default constructor.  Create a RegexPattern object that refers to no actual
     92      *   pattern.  Not normally needed; RegexPattern objects are usually
     93      *   created using the factory method <code>compile()</code>.
     94      *
     95      * @stable ICU 2.4
     96      */
     97     RegexPattern();
     98 
     99     /**
    100      * Copy Constructor.  Create a new RegexPattern object that is equivalent
    101      *                    to the source object.
    102      * @param source the pattern object to be copied.
    103      * @stable ICU 2.4
    104      */
    105     RegexPattern(const RegexPattern &source);
    106 
    107     /**
    108      * Destructor.  Note that a RegexPattern object must persist so long as any
    109      *  RegexMatcher objects that were created from the RegexPattern are active.
    110      * @stable ICU 2.4
    111      */
    112     virtual ~RegexPattern();
    113 
    114     /**
    115      * Comparison operator.  Two RegexPattern objects are considered equal if they
    116      * were constructed from identical source patterns using the same match flag
    117      * settings.
    118      * @param that a RegexPattern object to compare with "this".
    119      * @return TRUE if the objects are equivalent.
    120      * @stable ICU 2.4
    121      */
    122     UBool           operator==(const RegexPattern& that) const;
    123 
    124     /**
    125      * Inequality operator.  Two RegexPattern objects are considered different unless
    126      * they were constructed from identical source patterns using the same match flag
    127      * settings.  Implemented as the negation of operator==.
    128      * @param that a RegexPattern object to compare with "this".
    129      * @return TRUE if the objects are different.
    130      * @stable ICU 2.4
    131      */
    132     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);}
    133 
    134     /**
    135      * Assignment operator.  After assignment, this RegexPattern will behave identically
    136      *     to the source object.
             * @param source the pattern object to be assigned from.
    137      * @stable ICU 2.4
    138      */
    139     RegexPattern  &operator =(const RegexPattern &source);
    140 
    141     /**
    142      * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
    143      * intended to be subclassed, <code>clone()</code> and the copy construction are
    144      * equivalent operations.
    145      * @return the copy of this RegexPattern
    146      * @stable ICU 2.4
    147      */
    148     virtual RegexPattern  *clone() const;
    149 
    150 
    151    /**
    152     * Compiles the regular expression in string form into a RegexPattern
    153     * object.  These compile methods, rather than the constructors, are the usual
    154     * way that RegexPattern objects are created.
    155     *
    156     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    157     * objects created from the pattern are active.  RegexMatchers keep a pointer
    158     * back to their pattern, so premature deletion of the pattern is a
    159     * catastrophic error.</p>
    160     *
    161     * <p>All pattern match mode flags are set to their default values.</p>
    162     *
    163     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    164     *    from a pattern string rather than separately compiling the pattern and
    165     *    then creating a RegexMatcher object from the pattern.</p>
    166     *
    167     * @param regex The regular expression to be compiled.
    168     * @param pe    Receives the position (line and column numbers) of any error
    169     *              within the regular expression.
    170     * @param status A reference to a UErrorCode to receive any errors.
    171     * @return      A regexPattern object for the compiled pattern.
    172     *
    173     * @stable ICU 2.4
    174     */
    175     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
    176         UParseError          &pe,
    177         UErrorCode           &status);
    178 
    179    /**
    180     * Compiles the regular expression in string form into a RegexPattern
    181     * object.  These compile methods, rather than the constructors, are the usual
    182     * way that RegexPattern objects are created.
    183     *
    184     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    185     * objects created from the pattern are active.  RegexMatchers keep a pointer
    186     * back to their pattern, so premature deletion of the pattern is a
    187     * catastrophic error.</p>
    188     *
    189     * <p>All pattern match mode flags are set to their default values.</p>
    190     *
    191     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    192     *    from a pattern string rather than separately compiling the pattern and
    193     *    then creating a RegexMatcher object from the pattern.</p>
    194     *
    195     * @param regex The regular expression to be compiled. Note, the text referred
    196     *              to by this UText must not be deleted during the lifetime of the
    197     *              RegexPattern object or any RegexMatcher object created from it.
    198     * @param pe    Receives the position (line and column numbers) of any error
    199     *              within the regular expression.
    200     * @param status A reference to a UErrorCode to receive any errors.
    201     * @return      A regexPattern object for the compiled pattern.
    202     *
    203     * @stable ICU 4.6
    204     */
    205     static RegexPattern * U_EXPORT2 compile( UText *regex,
    206         UParseError          &pe,
    207         UErrorCode           &status);
    208 
    209    /**
    210     * Compiles the regular expression in string form into a RegexPattern
    211     * object using the specified match mode flags.  These compile methods,
    212     * rather than the constructors, are the usual way that RegexPattern objects
    213     * are created.
    214     *
    215     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    216     * objects created from the pattern are active.  RegexMatchers keep a pointer
    217     * back to their pattern, so premature deletion of the pattern is a
    218     * catastrophic error.</p>
    219     *
    220     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    221     *    from a pattern string rather than separately compiling the pattern and
    222     *    then creating a RegexMatcher object from the pattern.</p>
    223     *
    224     * @param regex The regular expression to be compiled.
    225     * @param flags The match mode flags to be used.
    226     * @param pe    Receives the position (line and column numbers) of any error
    227     *              within the regular expression.
    228     * @param status   A reference to a UErrorCode to receive any errors.
    229     * @return      A regexPattern object for the compiled pattern.
    230     *
    231     * @stable ICU 2.4
    232     */
    233     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
    234         uint32_t             flags,
    235         UParseError          &pe,
    236         UErrorCode           &status);
    237 
    238    /**
    239     * Compiles the regular expression in string form into a RegexPattern
    240     * object using the specified match mode flags.  These compile methods,
    241     * rather than the constructors, are the usual way that RegexPattern objects
    242     * are created.
    243     *
    244     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    245     * objects created from the pattern are active.  RegexMatchers keep a pointer
    246     * back to their pattern, so premature deletion of the pattern is a
    247     * catastrophic error.</p>
    248     *
    249     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    250     *    from a pattern string rather than separately compiling the pattern and
    251     *    then creating a RegexMatcher object from the pattern.</p>
    252     *
    253     * @param regex The regular expression to be compiled. Note, the text referred
    254     *              to by this UText must not be deleted during the lifetime of the
    255     *              RegexPattern object or any RegexMatcher object created from it.
    256     * @param flags The match mode flags to be used.
    257     * @param pe    Receives the position (line and column numbers) of any error
    258     *              within the regular expression.
    259     * @param status   A reference to a UErrorCode to receive any errors.
    260     * @return      A regexPattern object for the compiled pattern.
    261     *
    262     * @stable ICU 4.6
    263     */
    264     static RegexPattern * U_EXPORT2 compile( UText *regex,
    265         uint32_t             flags,
    266         UParseError          &pe,
    267         UErrorCode           &status);
    268 
    269    /**
    270     * Compiles the regular expression in string form into a RegexPattern
    271     * object using the specified match mode flags.  These compile methods,
    272     * rather than the constructors, are the usual way that RegexPattern objects
    273     * are created.
    274     *
    275     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    276     * objects created from the pattern are active.  RegexMatchers keep a pointer
    277     * back to their pattern, so premature deletion of the pattern is a
    278     * catastrophic error.</p>
    279     *
    280     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    281     *    from a pattern string rather than separately compiling the pattern and
    282     *    then creating a RegexMatcher object from the pattern.</p>
    283     *
    284     * @param regex The regular expression to be compiled.
    285     * @param flags The match mode flags to be used.
    286     * @param status   A reference to a UErrorCode to receive any errors.
    287     * @return      A regexPattern object for the compiled pattern.
    288     *
    289     * @stable ICU 2.6
    290     */
    291     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
    292         uint32_t             flags,
    293         UErrorCode           &status);
    294 
    295    /**
    296     * Compiles the regular expression in string form into a RegexPattern
    297     * object using the specified match mode flags.  These compile methods,
    298     * rather than the constructors, are the usual way that RegexPattern objects
    299     * are created.
    300     *
    301     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
    302     * objects created from the pattern are active.  RegexMatchers keep a pointer
    303     * back to their pattern, so premature deletion of the pattern is a
    304     * catastrophic error.</p>
    305     *
    306     * <p>Note that it is often more convenient to construct a RegexMatcher directly
    307     *    from a pattern string rather than separately compiling the pattern and
    308     *    then creating a RegexMatcher object from the pattern.</p>
    309     *
    310     * @param regex The regular expression to be compiled. Note, the text referred
    311     *              to by this UText must not be deleted during the lifetime of the
    312     *              RegexPattern object or any RegexMatcher object created from it.
    313     * @param flags The match mode flags to be used.
    314     * @param status   A reference to a UErrorCode to receive any errors.
    315     * @return      A regexPattern object for the compiled pattern.
    316     *
    317     * @stable ICU 4.6
    318     */
    319     static RegexPattern * U_EXPORT2 compile( UText *regex,
    320         uint32_t             flags,
    321         UErrorCode           &status);
    322 
    323    /**
    324     * Get the match mode flags that were used when compiling this pattern.
    325     * @return  the match mode flags
    326     * @stable ICU 2.4
    327     */
    328     virtual uint32_t flags() const;
    329 
    330    /**
    331     * Creates a RegexMatcher that will match the given input against this pattern.  The
    332     * RegexMatcher can then be used to perform match, find or replace operations
    333     * on the input.  Note that a RegexPattern object must not be deleted while
    334     * RegexMatchers created from it still exist and might possibly be used again.
    335     * <p>
    336     * The matcher will retain a reference to the supplied input string, and all regexp
    337     * pattern matching operations happen directly on this original string.  It is
    338     * critical that the string not be altered or deleted before use by the regular
    339     * expression operations is complete.
    340     *
    341     * @param input    The input string to which the regular expression will be applied.
    342     * @param status   A reference to a UErrorCode to receive any errors.
    343     * @return         A RegexMatcher object for this pattern and input.
    344     *
    345     * @stable ICU 2.4
    346     */
    347     virtual RegexMatcher *matcher(const UnicodeString &input,
    348         UErrorCode          &status) const;
    349 
    350 private:
    351     /**
    352      * Cause a compilation error if an application accidentally attempts to
    353      *   create a matcher with a (char16_t *) string as input rather than
    354      *   a UnicodeString.  Avoids a dangling reference to a temporary string.
    355      * <p>
    356      * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
    357      * using one of the aliasing constructors, such as
    358      * <code>UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);</code>
    359      * or in a UText, using
    360      * <code>utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);</code>
    361      *
    362      */
    363     RegexMatcher *matcher(const char16_t *input,
    364         UErrorCode          &status) const;
    365 public:
    366 
    367 
    368    /**
    369     * Creates a RegexMatcher that will match against this pattern.  The
    370     * RegexMatcher can be used to perform match, find or replace operations.
    371     * Note that a RegexPattern object must not be deleted while
    372     * RegexMatchers created from it still exist and might possibly be used again.
    373     *
    374     * @param status   A reference to a UErrorCode to receive any errors.
    375     * @return      A RegexMatcher object for this pattern; this overload takes no input text.
    376     *
    377     * @stable ICU 2.6
    378     */
    379     virtual RegexMatcher *matcher(UErrorCode  &status) const;
    380 
    381 
    382    /**
    383     * Test whether a string matches a regular expression.  This convenience function
    384     * both compiles the regular expression and applies it in a single operation.
    385     * Note that if the same pattern needs to be applied repeatedly, this method will be
    386     * less efficient than creating and reusing a RegexMatcher object.
    387     *
    388     * @param regex The regular expression
    389     * @param input The string data to be matched
    390     * @param pe Receives the position of any syntax errors within the regular expression
    391     * @param status A reference to a UErrorCode to receive any errors.
    392     * @return True if the regular expression exactly matches the full input string.
    393     *
    394     * @stable ICU 2.4
    395     */
    396     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
    397         const UnicodeString   &input,
    398               UParseError     &pe,
    399               UErrorCode      &status);
    400 
    401    /**
    402     * Test whether a string matches a regular expression.  This convenience function
    403     * both compiles the regular expression and applies it in a single operation.
    404     * Note that if the same pattern needs to be applied repeatedly, this method will be
    405     * less efficient than creating and reusing a RegexMatcher object.
    406     *
    407     * @param regex The regular expression
    408     * @param input The string data to be matched
    409     * @param pe Receives the position of any syntax errors within the regular expression
    410     * @param status A reference to a UErrorCode to receive any errors.
    411     * @return True if the regular expression exactly matches the full input string.
    412     *
    413     * @stable ICU 4.6
    414     */
    415     static UBool U_EXPORT2 matches(UText *regex,
    416         UText           *input,
    417         UParseError     &pe,
    418         UErrorCode      &status);
    419 
    420    /**
    421     * Returns the regular expression from which this pattern was compiled. This method will work
    422     * even if the pattern was compiled from a UText.
    423     *
    424     * Note: If the pattern was originally compiled from a UText, and that UText was modified,
    425     * the returned string may no longer reflect the RegexPattern object.
    426     * @stable ICU 2.4
    427     */
    428     virtual UnicodeString pattern() const;
    429 
    430 
    431    /**
    432     * Returns the regular expression from which this pattern was compiled. This method will work
    433     * even if the pattern was compiled from a UnicodeString.
    434     *
    435     * Note: This is the original input, not a clone. If the pattern was originally compiled from a
    436     * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
    437     * object.
    438     *
    439     * @stable ICU 4.6
    440     */
    441     virtual UText *patternText(UErrorCode      &status) const;
    442 
    443 
    444     /**
    445      * Get the group number corresponding to a named capture group.
    446      * The returned number can be used with any function that access
    447      * capture groups by number.
    448      *
    449      * The function returns an error status if the specified name does not
    450      * appear in the pattern.
    451      *
    452      * @param  groupName   The capture group name.
    453      * @param  status      A UErrorCode to receive any errors.
    454      *
    455      * @stable ICU 55
    456      */
    457     virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
    458 
    459 
    460     /**
    461      * Get the group number corresponding to a named capture group.
    462      * The returned number can be used with any function that accesses
    463      * capture groups by number.
    464      *
    465      * The function reports an error through status if the specified name
    466      * does not appear in the pattern.
    467      *
    468      * @param  groupName   The capture group name,
    469      *                     platform invariant characters only.
    470      * @param  nameLength  The length of the name, or -1 if the name is
    471      *                     nul-terminated.
    472      * @param  status      A UErrorCode to receive any errors.
    473      * @return             The group number corresponding to the named capture group.
    474      * @stable ICU 55
    475      */
    476     virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
    477 
    478 
    479     /**
    480      * Split a string into fields.  Somewhat like split() from Perl or Java.
    481      * Pattern matches identify delimiters that separate the input
    482      * into fields.  The input data between the delimiters becomes the
    483      * fields themselves.
    484      *
    485      * If the delimiter pattern includes capture groups, the captured text will
    486      * also appear in the destination array of output strings, interspersed
    487      * with the fields.  This is similar to Perl, but differs from Java,
    488      * which ignores the presence of capture groups in the pattern.
    489      *
    490      * Trailing empty fields will always be returned, assuming sufficient
    491      * destination capacity.  This differs from the default behavior for Java
    492      * and Perl where trailing empty fields are not returned.
    493      *
    494      * The number of strings produced by the split operation is returned.
    495      * This count includes the strings from capture groups in the delimiter pattern.
    496      * This behavior differs from Java, which ignores capture groups.
    497      *
    498      * For the best performance on split() operations,
    499      * <code>RegexMatcher::split</code> is preferable to this function
    500      *
    501      * @param input   The string to be split into fields.  The field delimiters
    502      *                match the pattern (in the "this" object)
    503      * @param dest    An array of UnicodeStrings to receive the results of the split.
    504      *                This is an array of actual UnicodeString objects, not an
    505      *                array of pointers to strings.  Local (stack based) arrays can
    506      *                work well here.
    507      * @param destCapacity  The number of elements in the destination array.
    508      *                If the number of fields found is less than destCapacity, the
    509      *                extra strings in the destination array are not altered.
    510      *                If the number of destination strings is less than the number
    511      *                of fields, the trailing part of the input string, including any
    512      *                field delimiters, is placed in the last destination string.
    513      * @param status  A reference to a UErrorCode to receive any errors.
    514      * @return        The number of fields into which the input string was split.
    515      * @stable ICU 2.4
    516      */
    517     virtual int32_t  split(const UnicodeString &input,
    518         UnicodeString    dest[],
    519         int32_t          destCapacity,
    520         UErrorCode       &status) const;
    521 
    522 
    523     /**
    524      * Split a string into fields.  Somewhat like split() from Perl or Java.
    525      * Pattern matches identify delimiters that separate the input
    526      * into fields.  The input data between the delimiters becomes the
    527      * fields themselves.
    528      *
    529      * If the delimiter pattern includes capture groups, the captured text will
    530      * also appear in the destination array of output strings, interspersed
    531      * with the fields.  This is similar to Perl, but differs from Java,
    532      * which ignores the presence of capture groups in the pattern.
    533      *
    534      * Trailing empty fields will always be returned, assuming sufficient
    535      * destination capacity.  This differs from the default behavior for Java
    536      * and Perl where trailing empty fields are not returned.
    537      *
    538      * The number of strings produced by the split operation is returned.
    539      * This count includes the strings from capture groups in the delimiter pattern.
    540      * This behavior differs from Java, which ignores capture groups.
    541      *
    542      *  For the best performance on split() operations,
    543      *  <code>RegexMatcher::split</code> is preferable to this function
    544      *
    545      * @param input   The string to be split into fields.  The field delimiters
    546      *                match the pattern (in the "this" object)
    547      * @param dest    An array of mutable UText structs to receive the results of the split.
    548      *                If a field is NULL, a new UText is allocated to contain the results for
    549      *                that field. This new UText is not guaranteed to be mutable.
    550      * @param destCapacity  The number of elements in the destination array.
    551      *                If the number of fields found is less than destCapacity, the
    552      *                extra strings in the destination array are not altered.
    553      *                If the number of destination strings is less than the number
    554      *                of fields, the trailing part of the input string, including any
    555      *                field delimiters, is placed in the last destination string.
    556      * @param status  A reference to a UErrorCode to receive any errors.
    557      * @return        The number of destination strings used.
    558      *
    559      * @stable ICU 4.6
    560      */
    561     virtual int32_t  split(UText *input,
    562         UText            *dest[],
    563         int32_t          destCapacity,
    564         UErrorCode       &status) const;
    565 
    566 
    567     /**
    568      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    569      *
    570      * @stable ICU 2.4
    571      */
    572     virtual UClassID getDynamicClassID() const;
    573 
    574     /**
    575      * ICU "poor man's RTTI", returns a UClassID for this class.
    576      *
    577      * @stable ICU 2.4
    578      */
    579     static UClassID U_EXPORT2 getStaticClassID();
    580 
    581 private:
    582     //
    583     //  Implementation Data
    584     //
    585     UText          *fPattern;      // The original pattern string.
    586     UnicodeString  *fPatternString; // The original pattern UnicodeString, if relevant
    587     uint32_t        fFlags;        // The flags used when compiling the pattern.
    588                                    //
    589     UVector64       *fCompiledPat; // The compiled pattern p-code.
    590     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
    591                                    //   after un-escaping, for use during the match.
    592 
    593     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
    594     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
    595 
    596 
    597     UErrorCode      fDeferredStatus; // status if some prior error has left this
    598                                    //  RegexPattern in an unusable state.
    599 
    600     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
    601                                    //   >= this value.  For some patterns, this calculated
    602                                    //   value may be less than the true shortest
    603                                    //   possible match.
    604 
    605     int32_t         fFrameSize;    // Size of a state stack frame in the
    606                                    //   execution engine.
    607 
    608     int32_t         fDataSize;     // The size of the data needed by the pattern that
    609                                    //   does not go on the state stack, but has just
    610                                    //   a single copy per matcher.
    611 
    612     UVector32       *fGroupMap;    // Map from capture group number to position of
    613                                    //   the group's variables in the matcher stack frame.
    614 
    615     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
    616                                    //   regex character classes, e.g. Word.
    617 
    618     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
    619                                    //  sets for predefined regex classes.
    620 
    621     int32_t         fStartType;    // Info on how a match must start.
    622     int32_t         fInitialStringIdx;     //
    623     int32_t         fInitialStringLen;
    624     UnicodeSet     *fInitialChars;
    625     UChar32         fInitialChar;
    626     Regex8BitSet   *fInitialChars8;
    627     UBool           fNeedsAltInput;
    628 
    629     UHashtable     *fNamedCaptureMap;  // Map from capture group names to numbers.
    630 
    631     friend class RegexCompile;
    632     friend class RegexMatcher;
    633     friend class RegexCImpl;
    634 
    635     //
    636     //  Implementation Methods
    637     //
    638     void        init();            // Common initialization, for use by constructors.
    639     void        zap();             // Common cleanup
    640 
    641     void        dumpOp(int32_t index) const;
    642 
    643   public:
    644 #ifndef U_HIDE_INTERNAL_API
    645     /**
    646       * Dump a compiled pattern. Internal debug function.
    647       * @internal
    648       */
    649     void        dumpPattern() const;
    650 #endif  /* U_HIDE_INTERNAL_API */
    651 };
    652 
    653 
    654 
    655 /**
    656  *  class RegexMatcher bundles together a regular expression pattern and
    657  *  input text to which the expression can be applied.  It includes methods
    658  *  for testing for matches, and for find and replace operations.
    659  *
    660  * <p>Class RegexMatcher is not intended to be subclassed.</p>
    661  *
    662  * @stable ICU 2.4
    663  */
    664 class U_I18N_API RegexMatcher U_FINAL : public UObject {
    665 public:
    666 
    667     /**
    668       * Construct a RegexMatcher for a regular expression.
    669       * This is a convenience method that avoids the need to explicitly create
    670       * a RegexPattern object.  Note that if several RegexMatchers need to be
    671       * created for the same expression, it will be more efficient to
    672       * separately create and cache a RegexPattern object, and use
    673       * its matcher() method to create the RegexMatcher objects.
    674       *
    675       *  @param regexp The Regular Expression to be compiled.
    676       *  @param flags  Regular expression options, such as case insensitive matching.
    677       *                @see UREGEX_CASE_INSENSITIVE
    678       *  @param status Any errors are reported by setting this UErrorCode variable.
    679       *  @stable ICU 2.6
    680       */
    681     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
    682 
    683     /**
    684       * Construct a RegexMatcher for a regular expression.
    685       * This is a convenience method that avoids the need to explicitly create
    686       * a RegexPattern object.  Note that if several RegexMatchers need to be
    687       * created for the same expression, it will be more efficient to
    688       * separately create and cache a RegexPattern object, and use
    689       * its matcher() method to create the RegexMatcher objects.
    690       *
    691       *  @param regexp The regular expression to be compiled.
    692       *  @param flags  Regular expression options, such as case insensitive matching.
    693       *                @see UREGEX_CASE_INSENSITIVE
    694       *  @param status Any errors are reported by setting this UErrorCode variable.
    695       *
    696       *  @stable ICU 4.6
    697       */
    698     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
    699 
    700     /**
    701       * Construct a RegexMatcher for a regular expression.
    702       * This is a convenience method that avoids the need to explicitly create
    703       * a RegexPattern object.  Note that if several RegexMatchers need to be
    704       * created for the same expression, it will be more efficient to
    705       * separately create and cache a RegexPattern object, and use
    706       * its matcher() method to create the RegexMatcher objects.
    707       * <p>
    708       * The matcher will retain a reference to the supplied input string, and all regexp
    709       * pattern matching operations happen directly on the original string.  It is
    710       * critical that the string not be altered or deleted before use by the regular
    711       * expression operations is complete.
    712       *
    713       *  @param regexp The Regular Expression to be compiled.
    714       *  @param input  The string to match.  The matcher retains a reference to the
    715       *                caller's string; no copy is made.
    716       *  @param flags  Regular expression options, such as case insensitive matching.
    717       *                @see UREGEX_CASE_INSENSITIVE
    718       *  @param status Any errors are reported by setting this UErrorCode variable.
    719       *  @stable ICU 2.6
    720       */
    721     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
    722         uint32_t flags, UErrorCode &status);
    723 
    724     /**
    725       * Construct a RegexMatcher for a regular expression.
    726       * This is a convenience method that avoids the need to explicitly create
    727       * a RegexPattern object.  Note that if several RegexMatchers need to be
    728       * created for the same expression, it will be more efficient to
    729       * separately create and cache a RegexPattern object, and use
    730       * its matcher() method to create the RegexMatcher objects.
    731       * <p>
    732       * The matcher will make a shallow clone of the supplied input text, and all regexp
    733       * pattern matching operations happen on this clone.  While read-only operations on
    734       * the supplied text are permitted, it is critical that the underlying string not be
    735       * altered or deleted before use by the regular expression operations is complete.
    736       *
    737       *  @param regexp The Regular Expression to be compiled.
    738       *  @param input  The string to match.  The matcher retains a shallow clone of the text.
    739       *  @param flags  Regular expression options, such as case insensitive matching.
    740       *                @see UREGEX_CASE_INSENSITIVE
    741       *  @param status Any errors are reported by setting this UErrorCode variable.
    742       *
    743       *  @stable ICU 4.6
    744       */
    745     RegexMatcher(UText *regexp, UText *input,
    746         uint32_t flags, UErrorCode &status);
    747 
    748 private:
    749     /**
    750      * Cause a compilation error if an application accidentally attempts to
    751      *   create a matcher with a (char16_t *) string as input rather than
    752      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
    753      * <p>
    754      * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
    755      * using one of the aliasing constructors, such as
    756      * <code>UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);</code>
    757      * or in a UText, using
    758      * <code>utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);</code>
    759      *
    760      */
    761     RegexMatcher(const UnicodeString &regexp, const char16_t *input,
    762         uint32_t flags, UErrorCode &status);
    763 public:
    764 
    765 
    766    /**
    767     *   Destructor.
    768     *
    769     *  @stable ICU 2.4
    770     */
    771     virtual ~RegexMatcher();
    772 
    773 
    774    /**
    775     *   Attempts to match the entire input region against the pattern.
    776     *    @param   status     A reference to a UErrorCode to receive any errors.
    777     *    @return TRUE if there is a match
    778     *    @stable ICU 2.4
    779     */
    780     virtual UBool matches(UErrorCode &status);
    781 
    782 
    783    /**
    784     *   Resets the matcher, then attempts to match the input beginning
    785     *   at the specified startIndex, and extending to the end of the input.
    786     *   The input region is reset to include the entire input string.
    787     *   A successful match must extend to the end of the input.
    788     *    @param   startIndex The input string (native) index at which to begin matching.
    789     *    @param   status     A reference to a UErrorCode to receive any errors.
    790     *    @return TRUE if there is a match
    791     *    @stable ICU 2.8
    792     */
    793     virtual UBool matches(int64_t startIndex, UErrorCode &status);
    794 
    795 
    796    /**
    797     *   Attempts to match the input string, starting from the beginning of the region,
    798     *   against the pattern.  Like the matches() method, this function
    799     *   always starts at the beginning of the input region;
    800     *   unlike that function, it does not require that the entire region be matched.
    801     *
    802     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
    803     *     <code>end()</code>, and <code>group()</code> functions.</p>
    804     *
    805     *    @param   status     A reference to a UErrorCode to receive any errors.
    806     *    @return  TRUE if there is a match at the start of the input string.
    807     *    @stable ICU 2.4
    808     */
    809     virtual UBool lookingAt(UErrorCode &status);
    810 
    811 
    812   /**
    813     *   Attempts to match the input string, starting from the specified index, against the pattern.
    814     *   The match may be of any length, and is not required to extend to the end
    815     *   of the input string.  Contrast with matches().
    816     *
    817     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
    818     *     <code>end()</code>, and <code>group()</code> functions.</p>
    819     *
    820     *    @param   startIndex The input string (native) index at which to begin matching.
    821     *    @param   status     A reference to a UErrorCode to receive any errors.
    822     *    @return  TRUE if there is a match.
    823     *    @stable ICU 2.8
    824     */
    825     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
    826 
    827 
    828    /**
    829     *  Find the next pattern match in the input string.
    830     *  The find begins searching the input at the location following the end of
    831     *  the previous match, or at the start of the string if there is no previous match.
    832     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
    833     *  will provide more information regarding the match.
    834     *  <p>Note that if the input string is changed by the application,
    835     *     use find(startPos, status) instead of find(), because the saved starting
    836     *     position may not be valid with the altered input string.</p>
    837     *  @return  TRUE if a match is found.
    838     *  @stable ICU 2.4
    839     */
    840     virtual UBool find();
    841 
    842 
    843    /**
    844     *  Find the next pattern match in the input string.
    845     *  The find begins searching the input at the location following the end of
    846     *  the previous match, or at the start of the string if there is no previous match.
    847     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
    848     *  will provide more information regarding the match.
    849     *  <p>Note that if the input string is changed by the application,
    850     *     use find(startPos, status) instead of find(), because the saved starting
    851     *     position may not be valid with the altered input string.</p>
    852     *  @param   status  A reference to a UErrorCode to receive any errors.
    853     *  @return  TRUE if a match is found.
    854     * @stable ICU 55
    855     */
    856     virtual UBool find(UErrorCode &status);
    857 
    858    /**
    859     *   Resets this RegexMatcher and then attempts to find the next substring of the
    860     *   input string that matches the pattern, starting at the specified index.
    861     *
    862     *   @param   start     The (native) index in the input string to begin the search.
    863     *   @param   status    A reference to a UErrorCode to receive any errors.
    864     *   @return  TRUE if a match is found.
    865     *   @stable ICU 2.4
    866     */
    867     virtual UBool find(int64_t start, UErrorCode &status);
    868 
    869 
    870    /**
    871     *   Returns a string containing the text matched by the previous match.
    872     *   If the pattern can match an empty string, an empty string may be returned.
    873     *   @param   status      A reference to a UErrorCode to receive any errors.
    874     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    875     *                        has been attempted or the last match failed.
    876     *   @return  a string containing the matched input text.
    877     *   @stable ICU 2.4
    878     */
    879     virtual UnicodeString group(UErrorCode &status) const;
    880 
    881 
    882    /**
    883     *    Returns a string containing the text captured by the given group
    884     *    during the previous match operation.  Group(0) is the entire match.
    885     *
    886     *    A zero length string is returned both for capture groups that did not
    887     *    participate in the match and for actual zero length matches.
    888     *    To distinguish between these two cases use the function start(),
    889     *    which returns -1 for non-participating groups.
    890     *
    891     *    @param groupNum the capture group number
    892     *    @param   status     A reference to a UErrorCode to receive any errors.
    893     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    894     *                        has been attempted or the last match failed and
    895     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    896     *    @return the captured text
    897     *    @stable ICU 2.4
    898     */
    899     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
    900 
    901    /**
    902     *   Returns the number of capturing groups in this matcher's pattern.
    903     *   @return the number of capture groups
    904     *   @stable ICU 2.4
    905     */
    906     virtual int32_t groupCount() const;
    907 
    908 
    909    /**
    910     *   Returns a shallow clone of the entire live input string with the UText current native index
    911     *   set to the beginning of the requested group.
    912     *
    913     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText
    914     *   @param   group_len   A reference to receive the length of the desired capture group
    915     *   @param   status      A reference to a UErrorCode to receive any errors.
    916     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    917     *                        has been attempted or the last match failed and
    918     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    919     *   @return dest if non-NULL, a shallow copy of the input text otherwise
    920     *
    921     *   @stable ICU 4.6
    922     */
    923     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
    924 
    925    /**
    926     *   Returns a shallow clone of the entire live input string with the UText current native index
    927     *   set to the beginning of the requested group.
    928     *
    929     *   A group length of zero is returned both for capture groups that did not
    930     *   participate in the match and for actual zero length matches.
    931     *   To distinguish between these two cases use the function start(),
    932     *   which returns -1 for non-participating groups.
    933     *
    934     *   @param   groupNum   The capture group number.
    935     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText.
    936     *   @param   group_len   A reference to receive the length of the desired capture group
    937     *   @param   status      A reference to a UErrorCode to receive any errors.
    938     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    939     *                        has been attempted or the last match failed and
    940     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    941     *   @return dest if non-NULL, a shallow copy of the input text otherwise
    942     *
    943     *   @stable ICU 4.6
    944     */
    945     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
    946 
    947    /**
    948     *   Returns the index in the input string of the start of the text matched
    949     *   during the previous match operation.
    950     *    @param   status      a reference to a UErrorCode to receive any errors.
    951     *    @return              The (native) position in the input string of the start of the last match.
    952     *    @stable ICU 2.4
    953     */
    954     virtual int32_t start(UErrorCode &status) const;
    955 
    956    /**
    957     *   Returns the index in the input string of the start of the text matched
    958     *   during the previous match operation.
    959     *    @param   status      a reference to a UErrorCode to receive any errors.
    960     *    @return              The (native) position in the input string of the start of the last match.
    961     *   @stable ICU 4.6
    962     */
    963     virtual int64_t start64(UErrorCode &status) const;
    964 
    965 
    966    /**
    967     *   Returns the index in the input string of the start of the text matched by the
    968     *    specified capture group during the previous match operation.  Return -1 if
    969     *    the capture group exists in the pattern, but was not part of the last match.
    970     *
    971     *    @param  group       the capture group number
    972     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
    973     *                        errors are  U_REGEX_INVALID_STATE if no match has been
    974     *                        attempted or the last match failed, and
    975     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    976     *    @return the (native) start position of substring matched by the specified group.
    977     *    @stable ICU 2.4
    978     */
    979     virtual int32_t start(int32_t group, UErrorCode &status) const;
    980 
    981    /**
    982     *   Returns the index in the input string of the start of the text matched by the
    983     *    specified capture group during the previous match operation.  Return -1 if
    984     *    the capture group exists in the pattern, but was not part of the last match.
    985     *
    986     *    @param  group       the capture group number.
    987     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
    988     *                        errors are  U_REGEX_INVALID_STATE if no match has been
    989     *                        attempted or the last match failed, and
    990     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    991     *    @return the (native) start position of substring matched by the specified group.
    992     *    @stable ICU 4.6
    993     */
    994     virtual int64_t start64(int32_t group, UErrorCode &status) const;
    995 
    996    /**
    997     *    Returns the index in the input string of the first character following the
    998     *    text matched during the previous match operation.
    999     *
   1000     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1001     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1002     *                        attempted or the last match failed.
   1003     *    @return the index of the last character matched, plus one.
   1004     *                        The index value returned is a native index, corresponding to
   1005     *                        code units for the underlying encoding type, for example,
   1006     *                        a byte index for UTF-8.
   1007     *   @stable ICU 2.4
   1008     */
   1009     virtual int32_t end(UErrorCode &status) const;
   1010 
   1011    /**
   1012     *    Returns the index in the input string of the first character following the
   1013     *    text matched during the previous match operation.
   1014     *
   1015     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1016     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1017     *                        attempted or the last match failed.
   1018     *    @return the index of the last character matched, plus one.
   1019     *                        The index value returned is a native index, corresponding to
   1020     *                        code units for the underlying encoding type, for example,
   1021     *                        a byte index for UTF-8.
   1022     *   @stable ICU 4.6
   1023     */
   1024     virtual int64_t end64(UErrorCode &status) const;
   1025 
   1026 
   1027    /**
   1028     *    Returns the index in the input string of the character following the
   1029     *    text matched by the specified capture group during the previous match operation.
   1030     *
   1031     *    @param group  the capture group number
   1032     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1033     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1034     *                        attempted or the last match failed and
   1035     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
   1036     *    @return  the index of the first character following the text
   1037     *              captured by the specified group during the previous match operation.
   1038     *              Return -1 if the capture group exists in the pattern but was not part of the match.
   1039     *              The index value returned is a native index, corresponding to
   1040     *              code units for the underlying encoding type, for example,
   1041     *              a byte index for UTF-8.
   1042     *    @stable ICU 2.4
   1043     */
   1044     virtual int32_t end(int32_t group, UErrorCode &status) const;
   1045 
   1046    /**
   1047     *    Returns the index in the input string of the character following the
   1048     *    text matched by the specified capture group during the previous match operation.
   1049     *
   1050     *    @param group  the capture group number
   1051     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1052     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1053     *                        attempted or the last match failed and
   1054     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
   1055     *    @return  the index of the first character following the text
   1056     *              captured by the specified group during the previous match operation.
   1057     *              Return -1 if the capture group exists in the pattern but was not part of the match.
   1058     *              The index value returned is a native index, corresponding to
   1059     *              code units for the underlying encoding type, for example,
   1060     *              a byte index for UTF-8.
   1061     *   @stable ICU 4.6
   1062     */
   1063     virtual int64_t end64(int32_t group, UErrorCode &status) const;
   1064 
   1065    /**
   1066     *   Resets this matcher.  The effect is to remove any memory of previous matches,
   1067     *       and to cause subsequent find() operations to begin at the beginning of
   1068     *       the input string.
   1069     *
   1070     *   @return this RegexMatcher.
   1071     *   @stable ICU 2.4
   1072     */
   1073     virtual RegexMatcher &reset();
   1074 
   1075 
   1076    /**
   1077     *   Resets this matcher, and sets the current input position.
   1078     *   The effect is to remove any memory of previous matches,
   1079     *       and to cause subsequent find() operations to begin at
   1080     *       the specified (native) position in the input string.
   1081     * <p>
   1082     *   The matcher's region is reset to its default, which is the entire
   1083     *   input string.
   1084     * <p>
   1085     *   An alternative to this function is to set a match region
   1086     *   beginning at the desired index.
   1087     *
   1088     *   @return this RegexMatcher.
   1089     *   @stable ICU 2.8
   1090     */
   1091     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
   1092 
   1093 
   1094    /**
   1095     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
   1096     *     to be reused, which is more efficient than creating a new RegexMatcher for
   1097     *     each input string to be processed.
   1098     *   @param input The new string on which subsequent pattern matches will operate.
   1099     *                The matcher retains a reference to the caller's string, and operates
   1100     *                directly on that.  Ownership of the string remains with the caller.
   1101     *                Because no copy of the string is made, it is essential that the
   1102     *                caller not delete the string until after regexp operations on it
   1103     *                are done.
   1104     *                Note that while a reset on the matcher with an input string that is then
   1105     *                modified across/during matcher operations may be supported currently for UnicodeString,
   1106     *                this was not originally intended behavior, and support for this is not guaranteed
   1107     *                in upcoming versions of ICU.
   1108     *   @return this RegexMatcher.
   1109     *   @stable ICU 2.4
   1110     */
   1111     virtual RegexMatcher &reset(const UnicodeString &input);
   1112 
   1113 
   1114    /**
   1115     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
   1116     *     to be reused, which is more efficient than creating a new RegexMatcher for
   1117     *     each input string to be processed.
   1118     *   @param input The new string on which subsequent pattern matches will operate.
   1119     *                The matcher makes a shallow clone of the given text; ownership of the
   1120     *                original string remains with the caller. Because no deep copy of the
   1121     *                text is made, it is essential that the caller not modify the string
   1122     *                until after regexp operations on it are done.
   1123     *   @return this RegexMatcher.
   1124     *
   1125     *   @stable ICU 4.6
   1126     */
   1127     virtual RegexMatcher &reset(UText *input);
   1128 
   1129 
   1130   /**
   1131     *  Set the subject text string upon which the regular expression is looking for matches
   1132     *  without changing any other aspect of the matching state.
   1133     *  The new and previous text strings must have the same content.
   1134     *
   1135     *  This function is intended for use in environments where ICU is operating on
   1136     *  strings that may move around in memory.  It provides a mechanism for notifying
   1137     *  ICU that the string has been relocated, and providing a new UText to access the
   1138     *  string in its new position.
   1139     *
   1140     *  Note that the regular expression implementation never copies the underlying text
   1141     *  of a string being matched, but always operates directly on the original text
   1142     *  provided by the user. Refreshing simply drops the references to the old text
   1143     *  and replaces them with references to the new.
   1144     *
   1145     *  Caution:  this function is normally used only by very specialized,
   1146     *  system-level code.  One example use case is with garbage collection that moves
   1147     *  the text in memory.
   1148     *
   1149     * @param input      The new (moved) text string.
   1150     * @param status     Receives errors detected by this function.
   1151     *
   1152     * @stable ICU 4.8
   1153     */
   1154     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
   1155 
   1156 private:
   1157     /**
   1158      * Cause a compilation error if an application accidentally attempts to
   1159      *   reset a matcher with a (char16_t *) string as input rather than
   1160      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
   1161      * <p>
   1162      * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
   1163      * using one of the aliasing constructors, such as
   1164      * <code>UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);</code>
   1165      * or in a UText, using
   1166      * <code>utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);</code>
   1167      *
   1168      */
   1169     RegexMatcher &reset(const char16_t *input);
   1170 public:
   1171 
   1172    /**
   1173     *   Returns the input string being matched.  Ownership of the string belongs to
   1174     *   the matcher; it should not be altered or deleted. This method will work even if the input
   1175     *   was originally supplied as a UText.
   1176     *   @return the input string
   1177     *   @stable ICU 2.4
   1178     */
   1179     virtual const UnicodeString &input() const;
   1180 
   1181    /**
   1182     *   Returns the input string being matched.  This is the live input text; it should not be
   1183     *   altered or deleted. This method will work even if the input was originally supplied as
   1184     *   a UnicodeString.
   1185     *   @return the input text
   1186     *
   1187     *   @stable ICU 4.6
   1188     */
   1189     virtual UText *inputText() const;
   1190 
   1191    /**
   1192     *   Returns the input string being matched, either by copying it into the provided
   1193     *   UText parameter or by returning a shallow clone of the live input. Note that copying
   1194     *   the entire input may cause significant performance and memory issues.
   1195     *   @param dest The UText into which the input should be copied, or NULL to create a new UText
   1196     *   @param status error code
   1197     *   @return dest if non-NULL, a shallow copy of the input text otherwise
   1198     *
   1199     *   @stable ICU 4.6
   1200     */
   1201     virtual UText *getInput(UText *dest, UErrorCode &status) const;
   1202 
   1203 
   1204    /** Sets the limits of this matcher's region.
   1205      * The region is the part of the input string that will be searched to find a match.
   1206      * Invoking this method resets the matcher, and then sets the region to start
   1207      * at the index specified by the start parameter and end at the index specified
   1208      * by the end parameter.
   1209      *
   1210      * Depending on the transparency and anchoring being used (see useTransparentBounds
   1211      * and useAnchoringBounds), certain constructs such as anchors may behave differently
   1212      * at or around the boundaries of the region.
   1213      *
   1214      * The function will fail if start is greater than limit, or if either index
   1215      *  is less than zero or greater than the length of the string being matched.
   1216      *
   1217      * @param start  The (native) index to begin searches at.
   1218      * @param limit  The index to end searches at (exclusive).
   1219      * @param status A reference to a UErrorCode to receive any errors.
   1220      * @stable ICU 4.0
   1221      */
   1222      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
   1223 
   1224    /**
   1225      * Identical to region(start, limit, status) but also allows a start position without
   1226      *  resetting the region state.
   1227      * @param regionStart The region start
   1228      * @param regionLimit the limit of the region
   1229      * @param startIndex  The (native) index within the region bounds at which to begin searches.
   1230      * @param status A reference to a UErrorCode to receive any errors.
   1231      *                If startIndex is not within the specified region bounds,
   1232      *                U_INDEX_OUTOFBOUNDS_ERROR is returned.
   1233      * @stable ICU 4.6
   1234      */
   1235      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
   1236 
   1237    /**
   1238      * Reports the start index of this matcher's region. The searches this matcher
   1239      * conducts are limited to finding matches within regionStart (inclusive) and
   1240      * regionEnd (exclusive).
   1241      *
   1242      * @return The starting (native) index of this matcher's region.
   1243      * @stable ICU 4.0
   1244      */
   1245      virtual int32_t regionStart() const;
   1246 
   1247    /**
   1248      * Reports the start index of this matcher's region. The searches this matcher
   1249      * conducts are limited to finding matches within regionStart (inclusive) and
   1250      * regionEnd (exclusive).
   1251      *
   1252      * @return The starting (native) index of this matcher's region.
   1253      * @stable ICU 4.6
   1254      */
   1255      virtual int64_t regionStart64() const;
   1256 
   1257 
   1258     /**
   1259       * Reports the end (limit) index (exclusive) of this matcher's region. The searches
   1260       * this matcher conducts are limited to finding matches within regionStart
   1261       * (inclusive) and regionEnd (exclusive).
   1262       *
   1263       * @return The ending point (native) of this matcher's region.
   1264       * @stable ICU 4.0
   1265       */
   1266       virtual int32_t regionEnd() const;
   1267 
   1268    /**
   1269      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
   1270      * this matcher conducts are limited to finding matches within regionStart
   1271      * (inclusive) and regionEnd (exclusive).
   1272      *
   1273      * @return The ending point (native) of this matcher's region.
   1274      * @stable ICU 4.6
   1275      */
   1276       virtual int64_t regionEnd64() const;
   1277 
   1278     /**
   1279       * Queries the transparency of region bounds for this matcher.
   1280       * See useTransparentBounds for a description of transparent and opaque bounds.
   1281       * By default, a matcher uses opaque region boundaries.
   1282       *
   1283       * @return TRUE if this matcher is using transparent bounds, FALSE if it is not.
   1284       * @stable ICU 4.0
   1285       */
   1286       virtual UBool hasTransparentBounds() const;
   1287 
   1288     /**
   1289       * Sets the transparency of region bounds for this matcher.
   1290       * Invoking this function with an argument of true will set this matcher to use transparent bounds.
   1291       * If the boolean argument is false, then opaque bounds will be used.
   1292       *
   1293       * Using transparent bounds, the boundaries of this matcher's region are transparent
   1294       * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
   1295       * see text beyond the boundaries of the region while checking for a match.
   1296       *
   1297       * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
   1298       * lookbehind, and boundary matching constructs.
   1299       *
   1300       * By default, a matcher uses opaque bounds.
   1301       *
   1302       * @param   b TRUE for transparent bounds; FALSE for opaque bounds
   1303       * @return  This Matcher
   1304       * @stable ICU 4.0
   1305       **/
   1306       virtual RegexMatcher &useTransparentBounds(UBool b);
   1307 
   1308 
   1309     /**
   1310       * Return true if this matcher is using anchoring bounds.
   1311       * By default, matchers use anchoring region bounds.
   1312       *
   1313       * @return TRUE if this matcher is using anchoring bounds.
   1314       * @stable ICU 4.0
   1315       */
   1316       virtual UBool hasAnchoringBounds() const;
   1317 
   1318 
   1319     /**
   1320       * Set whether this matcher is using Anchoring Bounds for its region.
   1321       * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
   1322       * and end of the region.  Without Anchoring Bounds, anchors will only match at
   1323       * the positions they would in the complete text.
   1324       *
   1325       * Anchoring Bounds are the default for regions.
   1326       *
   1327       * @param b TRUE to enable anchoring bounds; FALSE to disable them.
   1328       * @return  This Matcher
   1329       * @stable ICU 4.0
   1330       */
   1331       virtual RegexMatcher &useAnchoringBounds(UBool b);
   1332 
   1333 
   1334     /**
   1335       * Return TRUE if the most recent matching operation attempted to access
   1336       *  additional input beyond the available input text.
   1337       *  In this case, additional input text could change the results of the match.
   1338       *
   1339       *  hitEnd() is defined for both successful and unsuccessful matches.
   1340       *  In either case hitEnd() will return TRUE if the end of the text was
   1341       *  reached at any point during the matching process.
   1342       *
   1343       *  @return  TRUE if the most recent match hit the end of input
   1344       *  @stable ICU 4.0
   1345       */
   1346       virtual UBool hitEnd() const;
   1347 
   1348     /**
   1349       * Return TRUE if the most recent match succeeded and additional input could cause
   1350       * it to fail. If this method returns false and a match was found, then more input
   1351       * might change the match but the match won't be lost. If a match was not found,
   1352       * then requireEnd has no meaning.
   1353       *
   1354       * @return TRUE if more input could cause the most recent match to no longer match.
   1355       * @stable ICU 4.0
   1356       */
   1357       virtual UBool requireEnd() const;
   1358 
   1359 
   1360    /**
   1361     *    Returns the pattern that is interpreted by this matcher.
   1362     *    @return  the RegexPattern for this RegexMatcher
   1363     *    @stable ICU 2.4
   1364     */
   1365     virtual const RegexPattern &pattern() const;
   1366 
   1367 
   1368    /**
   1369     *    Replaces every substring of the input that matches the pattern
   1370     *    with the given replacement string.  This is a convenience function that
   1371     *    provides a complete find-and-replace-all operation.
   1372     *
   1373     *    This method first resets this matcher. It then scans the input string
   1374     *    looking for matches of the pattern. Input that is not part of any
   1375     *    match is left unchanged; each match is replaced in the result by the
   1376     *    replacement string. The replacement string may contain references to
   1377     *    capture groups.
   1378     *
   1379     *    @param   replacement a string containing the replacement text.
   1380     *    @param   status      a reference to a UErrorCode to receive any errors.
   1381     *    @return              a string containing the results of the find and replace.
   1382     *    @stable ICU 2.4
   1383     */
   1384     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
   1385 
   1386 
   1387    /**
   1388     *    Replaces every substring of the input that matches the pattern
   1389     *    with the given replacement string.  This is a convenience function that
   1390     *    provides a complete find-and-replace-all operation.
   1391     *
   1392     *    This method first resets this matcher. It then scans the input string
   1393     *    looking for matches of the pattern. Input that is not part of any
   1394     *    match is left unchanged; each match is replaced in the result by the
   1395     *    replacement string. The replacement string may contain references to
   1396     *    capture groups.
   1397     *
   1398     *    @param   replacement a string containing the replacement text.
   1399     *    @param   dest        a mutable UText in which the results are placed.
   1400     *                          If NULL, a new UText will be created (which may not be mutable).
   1401     *    @param   status      a reference to a UErrorCode to receive any errors.
   1402     *    @return              a string containing the results of the find and replace.
   1403     *                          If a pre-allocated UText was provided, it will always be used and returned.
   1404     *
   1405     *    @stable ICU 4.6
   1406     */
   1407     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
   1408 
   1409 
   1410    /**
   1411     * Replaces the first substring of the input that matches
   1412     * the pattern with the replacement string.   This is a convenience
   1413     * function that provides a complete find-and-replace operation.
   1414     *
   1415     * <p>This function first resets this RegexMatcher. It then scans the input string
   1416     * looking for a match of the pattern. Input that is not part
   1417     * of the match is appended directly to the result string; the match is replaced
   1418     * in the result by the replacement string. The replacement string may contain
   1419     * references to captured groups.</p>
   1420     *
   1421     * <p>The state of the matcher (the position at which a subsequent find()
   1422     *    would begin) after completing a replaceFirst() is not specified.  The
   1423     *    RegexMatcher should be reset before doing additional find() operations.</p>
   1424     *
   1425     *    @param   replacement a string containing the replacement text.
   1426     *    @param   status      a reference to a UErrorCode to receive any errors.
   1427     *    @return              a string containing the results of the find and replace.
   1428     *    @stable ICU 2.4
   1429     */
   1430     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
   1431 
   1432 
   1433    /**
   1434     * Replaces the first substring of the input that matches
   1435     * the pattern with the replacement string.   This is a convenience
   1436     * function that provides a complete find-and-replace operation.
   1437     *
   1438     * <p>This function first resets this RegexMatcher. It then scans the input string
   1439     * looking for a match of the pattern. Input that is not part
   1440     * of the match is appended directly to the result string; the match is replaced
   1441     * in the result by the replacement string. The replacement string may contain
   1442     * references to captured groups.</p>
   1443     *
   1444     * <p>The state of the matcher (the position at which a subsequent find()
   1445     *    would begin) after completing a replaceFirst() is not specified.  The
   1446     *    RegexMatcher should be reset before doing additional find() operations.</p>
   1447     *
   1448     *    @param   replacement a string containing the replacement text.
   1449     *    @param   dest        a mutable UText in which the results are placed.
   1450     *                          If NULL, a new UText will be created (which may not be mutable).
   1451     *    @param   status      a reference to a UErrorCode to receive any errors.
   1452     *    @return              a string containing the results of the find and replace.
   1453     *                          If a pre-allocated UText was provided, it will always be used and returned.
   1454     *
   1455     *    @stable ICU 4.6
   1456     */
   1457     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
   1458 
   1459 
   1460    /**
   1461     *   Implements a replace operation intended to be used as part of an
   1462     *   incremental find-and-replace.
   1463     *
   1464     *   <p>The input string, starting from the end of the previous replacement and ending at
   1465     *   the start of the current match, is appended to the destination string.  Then the
   1466     *   replacement string is appended to the output string,
   1467     *   including handling any substitutions of captured text.</p>
   1468     *
   1469     *   <p>For simple, prepackaged, non-incremental find-and-replace
   1470     *   operations, see replaceFirst() or replaceAll().</p>
   1471     *
   1472     *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
   1473     *   @param   replacement A UnicodeString that provides the text to be substituted for
   1474     *                        the input text that matched the regexp pattern.  The replacement
   1475     *                        text may contain references to captured text from the
   1476     *                        input.
   1477     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1478     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1479     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
   1480     *                        if the replacement text specifies a capture group that
   1481     *                        does not exist in the pattern.
   1482     *
   1483     *   @return  this  RegexMatcher
   1484     *   @stable ICU 2.4
   1485     *
   1486     */
   1487     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
   1488         const UnicodeString &replacement, UErrorCode &status);
   1489 
   1490 
   1491    /**
   1492     *   Implements a replace operation intended to be used as part of an
   1493     *   incremental find-and-replace.
   1494     *
   1495     *   <p>The input string, starting from the end of the previous replacement and ending at
   1496     *   the start of the current match, is appended to the destination string.  Then the
   1497     *   replacement string is appended to the output string,
   1498     *   including handling any substitutions of captured text.</p>
   1499     *
   1500     *   <p>For simple, prepackaged, non-incremental find-and-replace
   1501     *   operations, see replaceFirst() or replaceAll().</p>
   1502     *
   1503     *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
   1504     *                         Must not be NULL.
   1505     *   @param   replacement A UText that provides the text to be substituted for
   1506     *                        the input text that matched the regexp pattern.  The replacement
   1507     *                        text may contain references to captured text from the input.
   1508     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   1509     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   1510     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
   1511     *                        if the replacement text specifies a capture group that
   1512     *                        does not exist in the pattern.
   1513     *
   1514     *   @return  this  RegexMatcher
   1515     *
   1516     *   @stable ICU 4.6
   1517     */
   1518     virtual RegexMatcher &appendReplacement(UText *dest,
   1519         UText *replacement, UErrorCode &status);
   1520 
   1521 
   1522    /**
   1523     * As the final step in a find-and-replace operation, append the remainder
   1524     * of the input string, starting at the position following the last appendReplacement(),
   1525     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
   1526     * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
   1527     *
   1528     *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
   1529     *  @return  the destination string.
   1530     *  @stable ICU 2.4
   1531     */
   1532     virtual UnicodeString &appendTail(UnicodeString &dest);
   1533 
   1534 
   1535    /**
   1536     * As the final step in a find-and-replace operation, append the remainder
   1537     * of the input string, starting at the position following the last appendReplacement(),
   1538     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
   1539     * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
   1540     *
   1541     *  @param dest A mutable UText to which the results of the find-and-replace are appended.
   1542     *               Must not be NULL.
   1543     *  @param status error code
   1544     *  @return  the destination string.
   1545     *
   1546     *  @stable ICU 4.6
   1547     */
   1548     virtual UText *appendTail(UText *dest, UErrorCode &status);
   1549 
   1550 
   1551     /**
   1552      * Split a string into fields.  Somewhat like split() from Perl.
   1553      * The pattern matches identify delimiters that separate the input
   1554      *  into fields.  The input data between the matches becomes the
   1555      *  fields themselves.
   1556      *
   1557      * @param input   The string to be split into fields.  The field delimiters
   1558      *                match the pattern (in the "this" object).  This matcher
   1559      *                will be reset to this input string.
   1560      * @param dest    An array of UnicodeStrings to receive the results of the split.
   1561      *                This is an array of actual UnicodeString objects, not an
   1562      *                array of pointers to strings.  Local (stack based) arrays can
   1563      *                work well here.
   1564      * @param destCapacity  The number of elements in the destination array.
   1565      *                If the number of fields found is less than destCapacity, the
   1566      *                extra strings in the destination array are not altered.
   1567      *                If the number of destination strings is less than the number
   1568      *                of fields, the trailing part of the input string, including any
   1569      *                field delimiters, is placed in the last destination string.
   1570      * @param status  A reference to a UErrorCode to receive any errors.
   1571      * @return        The number of fields into which the input string was split.
   1572      * @stable ICU 2.6
   1573      */
   1574     virtual int32_t  split(const UnicodeString &input,
   1575         UnicodeString    dest[],
   1576         int32_t          destCapacity,
   1577         UErrorCode       &status);
   1578 
   1579 
   1580     /**
   1581      * Split a string into fields.  Somewhat like split() from Perl.
   1582      * The pattern matches identify delimiters that separate the input
   1583      *  into fields.  The input data between the matches becomes the
   1584      *  fields themselves.
   1585      *
   1586      * @param input   The string to be split into fields.  The field delimiters
   1587      *                match the pattern (in the "this" object).  This matcher
   1588      *                will be reset to this input string.
   1589      * @param dest    An array of mutable UText structs to receive the results of the split.
   1590      *                If a field is NULL, a new UText is allocated to contain the results for
   1591      *                that field. This new UText is not guaranteed to be mutable.
   1592      * @param destCapacity  The number of elements in the destination array.
   1593      *                If the number of fields found is less than destCapacity, the
   1594      *                extra strings in the destination array are not altered.
   1595      *                If the number of destination strings is less than the number
   1596      *                of fields, the trailing part of the input string, including any
   1597      *                field delimiters, is placed in the last destination string.
   1598      * @param status  A reference to a UErrorCode to receive any errors.
   1599      * @return        The number of fields into which the input string was split.
   1600      *
   1601      * @stable ICU 4.6
   1602      */
   1603     virtual int32_t  split(UText *input,
   1604         UText           *dest[],
   1605         int32_t          destCapacity,
   1606         UErrorCode       &status);
   1607 
   1608   /**
   1609     *   Set a processing time limit for match operations with this Matcher.
   1610     *
   1611     *   Some patterns, when matching certain strings, can run in exponential time.
   1612     *   For practical purposes, the match operation may appear to be in an
   1613     *   infinite loop.
   1614     *   When a limit is set a match operation will fail with an error if the
   1615     *   limit is exceeded.
   1616     *   <p>
   1617     *   The units of the limit are steps of the match engine.
   1618     *   Correspondence with actual processor time will depend on the speed
   1619     *   of the processor and the details of the specific pattern, but will
   1620     *   typically be on the order of milliseconds.
   1621     *   <p>
   1622     *   By default, the matching time is not limited.
   1623     *   <p>
   1624     *
   1625     *   @param   limit       The limit value, or 0 for no limit.
   1626     *   @param   status      A reference to a UErrorCode to receive any errors.
   1627     *   @stable ICU 4.0
   1628     */
   1629     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
   1630 
   1631   /**
   1632     * Get the time limit, if any, for match operations made with this Matcher.
   1633     *
   1634     *   @return the maximum allowed time for a match, in units of processing steps.
   1635     *   @stable ICU 4.0
   1636     */
   1637     virtual int32_t getTimeLimit() const;
   1638 
   1639   /**
   1640     *  Set the amount of heap storage available for use by the match backtracking stack.
   1641     *  The matcher is also reset, discarding any results from previous matches.
   1642     *  <p>
   1643     *  ICU uses a backtracking regular expression engine, with the backtrack stack
   1644     *  maintained on the heap.  This function sets the limit to the amount of memory
   1645     *  that can be used  for this purpose.  A backtracking stack overflow will
   1646     *  result in an error from the match operation that caused it.
   1647     *  <p>
   1648     *  A limit is desirable because a malicious or poorly designed pattern can use
   1649     *  excessive memory, potentially crashing the process.  A limit is enabled
   1650     *  by default.
   1651     *  <p>
   1652     *  @param limit  The maximum size, in bytes, of the matching backtrack stack.
   1653     *                A value of zero means no limit.
   1654     *                The limit must be greater than or equal to zero.
   1655     *
   1656     *  @param status   A reference to a UErrorCode to receive any errors.
   1657     *
   1658     *  @stable ICU 4.0
   1659     */
   1660     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
   1661 
   1662   /**
   1663     *  Get the size of the heap storage available for use by the backtracking stack.
   1664     *
   1665     *  @return  the maximum backtracking stack size, in bytes, or zero if the
   1666     *           stack size is unlimited.
   1667     *  @stable ICU 4.0
   1668     */
   1669     virtual int32_t  getStackLimit() const;
   1670 
   1671 
   1672   /**
   1673     * Set a callback function for use with this Matcher.
   1674     * During matching operations the function will be called periodically,
   1675     * giving the application the opportunity to terminate a long-running
   1676     * match.
   1677     *
   1678     *    @param   callback    A pointer to the user-supplied callback function.
   1679     *    @param   context     User context pointer.  The value supplied at the
   1680     *                         time the callback function is set will be saved
   1681     *                         and passed to the callback each time that it is called.
   1682     *    @param   status      A reference to a UErrorCode to receive any errors.
   1683     *  @stable ICU 4.0
   1684     */
   1685     virtual void setMatchCallback(URegexMatchCallback     *callback,
   1686                                   const void              *context,
   1687                                   UErrorCode              &status);
   1688 
   1689 
   1690   /**
   1691     *  Get the callback function for this Matcher.
   1692     *
   1693     *    @param   callback    Out parameter, receives a pointer to the user-supplied
   1694     *                         callback function.
   1695     *    @param   context     Out parameter, receives the user context pointer that
   1696     *                         was set when setMatchCallback() was called.
   1697     *    @param   status      A reference to a UErrorCode to receive any errors.
   1698     *    @stable ICU 4.0
   1699     */
   1700     virtual void getMatchCallback(URegexMatchCallback     *&callback,
   1701                                   const void              *&context,
   1702                                   UErrorCode              &status);
   1703 
   1704 
   1705   /**
   1706     * Set a progress callback function for use with find operations on this Matcher.
   1707     * During find operations, the callback will be invoked after each return from a
   1708     * match attempt, giving the application the opportunity to terminate a long-running
   1709     * find operation.
   1710     *
   1711     *    @param   callback    A pointer to the user-supplied callback function.
   1712     *    @param   context     User context pointer.  The value supplied at the
   1713     *                         time the callback function is set will be saved
   1714     *                         and passed to the callback each time that it is called.
   1715     *    @param   status      A reference to a UErrorCode to receive any errors.
   1716     *    @stable ICU 4.6
   1717     */
   1718     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
   1719                                               const void                              *context,
   1720                                               UErrorCode                              &status);
   1721 
   1722 
   1723   /**
   1724     *  Get the find progress callback function for this Matcher.
   1725     *
   1726     *    @param   callback    Out parameter, receives a pointer to the user-supplied
   1727     *                         callback function.
   1728     *    @param   context     Out parameter, receives the user context pointer that
   1729     *                         was set when setFindProgressCallback() was called.
   1730     *    @param   status      A reference to a UErrorCode to receive any errors.
   1731     *    @stable ICU 4.6
   1732     */
   1733     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
   1734                                               const void                      *&context,
   1735                                               UErrorCode                      &status);
   1736 
   1737 #ifndef U_HIDE_INTERNAL_API
   1738    /**
   1739      *   setTrace   Debug function, enable/disable tracing of the matching engine.
   1740      *              For internal ICU development use only.  DO NOT USE!!!!
   1741      *   @internal
   1742      */
   1743     void setTrace(UBool state);
   1744 #endif  /* U_HIDE_INTERNAL_API */
   1745 
   1746     /**
   1747     * ICU "poor man's RTTI", returns a UClassID for this class.
   1748     *
   1749     * @stable ICU 2.2
   1750     */
   1751     static UClassID U_EXPORT2 getStaticClassID();
   1752 
   1753     /**
   1754      * ICU "poor man's RTTI", returns a UClassID for the actual class.
   1755      *
   1756      * @stable ICU 2.2
   1757      */
   1758     virtual UClassID getDynamicClassID() const;
   1759 
   1760 private:
   1761     // Constructors and other object boilerplate are private.
   1762     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
   1763     RegexMatcher();                  // default constructor not implemented
   1764     RegexMatcher(const RegexPattern *pat);
   1765     RegexMatcher(const RegexMatcher &other);
   1766     RegexMatcher &operator =(const RegexMatcher &rhs);
   1767     void init(UErrorCode &status);                      // Common initialization
   1768     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
   1769 
   1770     friend class RegexPattern;
   1771     friend class RegexCImpl;
   1772 public:
   1773 #ifndef U_HIDE_INTERNAL_API
   1774     /** @internal  */
   1775     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
   1776 #endif  /* U_HIDE_INTERNAL_API */
   1777 private:
   1778 
   1779     //
   1780     //  MatchAt   This is the internal interface to the match engine itself.
   1781     //            Match status comes back in matcher member variables.
   1782     //
   1783     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
   1784     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
   1785     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
   1786     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
   1787     REStackFrame        *resetStack();
   1788     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
   1789     void                 IncrementTime(UErrorCode &status);
   1790 
   1791     // Call user find callback function, if set. Return TRUE if operation should be interrupted.
   1792     inline UBool         findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
   1793 
   1794     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
   1795 
   1796     UBool                findUsingChunk(UErrorCode &status);
   1797     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
   1798     UBool                isChunkWordBoundary(int32_t pos);
   1799 
   1800     const RegexPattern  *fPattern;
   1801     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
   1802                                            //   should delete it when through.
   1803 
   1804     const UnicodeString *fInput;           // The string being matched. Only used for input()
   1805     UText               *fInputText;       // The text being matched. Is never NULL.
   1806     UText               *fAltInputText;    // A shallow copy of the text being matched.
   1807                                            //   Only created if the pattern contains backreferences.
   1808     int64_t              fInputLength;     // Full length of the input text.
   1809     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
   1810 
   1811     int64_t              fRegionStart;     // Start of the input region, default = 0.
   1812     int64_t              fRegionLimit;     // End of input region, default to input.length.
   1813 
   1814     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
   1815     int64_t              fAnchorLimit;     //   See useAnchoringBounds
   1816 
   1817     int64_t              fLookStart;       // Region bounds for look-ahead/behind and
   1818     int64_t              fLookLimit;       //   other boundary tests.  See
   1819                                            //   useTransparentBounds
   1820 
   1821     int64_t              fActiveStart;     // Currently active bounds for matching.
   1822     int64_t              fActiveLimit;     //   Usually is the same as region, but
   1823                                            //   is changed to fLookStart/Limit when
   1824                                            //   entering look around regions.
   1825 
   1826     UBool                fTransparentBounds;  // True if using transparent bounds.
   1827     UBool                fAnchoringBounds; // True if using anchoring bounds.
   1828 
   1829     UBool                fMatch;           // True if the last attempted match was successful.
   1830     int64_t              fMatchStart;      // Position of the start of the most recent match
   1831     int64_t              fMatchEnd;        // First position after the end of the most recent match
   1832                                            //   Zero if no previous match, even when a region
   1833                                            //   is active.
   1834     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
   1835                                            //   or -1 if there was no previous match.
   1836     int64_t              fAppendPosition;  // First position after the end of the previous
   1837                                            //   appendReplacement().  As described by the
   1838                                            //   JavaDoc for Java Matcher, where it is called
   1839                                            //   "append position"
   1840     UBool                fHitEnd;          // True if the last match touched the end of input.
   1841     UBool                fRequireEnd;      // True if the last match required end-of-input
   1842                                            //    (matched $ or Z)
   1843 
   1844     UVector64           *fStack;
   1845     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
   1846                                            //   which will contain the capture group results.
   1847                                            //   NOT valid while match engine is running.
   1848 
   1849     int64_t             *fData;            // Data area for use by the compiled pattern.
   1850     int64_t             fSmallData[8];     //   Use this for data if it's enough.
   1851 
   1852     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
   1853                                            //   match engine run.  Zero for unlimited.
   1854 
   1855     int32_t             fTime;             // Match time, accumulates while matching.
   1856     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
   1857                                            //   Kept separately from fTime to keep as much
   1858                                            //   code as possible out of the inline
   1859                                            //   StateSave function.
   1860 
   1861     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
   1862                                            //   stack, in bytes.  Zero for unlimited.
   1863 
   1864     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback function.
   1865                                            //   NULL if there is no callback.
   1866     const void         *fCallbackContext;  // User context ptr for the match callback function.
   1867 
   1868     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback function.
   1869                                                            //   NULL if there is no callback.
   1870     const void         *fFindProgressCallbackContext;      // User context ptr for the find progress callback.
   1871 
   1872 
   1873     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
   1874 
   1875     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
   1876 
   1877     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
   1878                                            //   reported, or that permanently disables this matcher.
   1879 
   1880     RuleBasedBreakIterator  *fWordBreakItr;  // NOTE(review): presumably the break iterator behind isUWordBoundary() - confirm.
   1881 };
   1882 
   1883 U_NAMESPACE_END
   1884 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
   1885 #endif
   1886