Home | History | Annotate | Download | only in unicode
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  03/22/2000   helena      Creation.
      9 **********************************************************************
     10 */
     11 
     12 #ifndef SEARCH_H
     13 #define SEARCH_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 /**
     18  * \file
     19  * \brief C++ API: SearchIterator object.
     20  */
     21 
     22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     23 
     24 #include "unicode/uobject.h"
     25 #include "unicode/unistr.h"
     26 #include "unicode/chariter.h"
     27 #include "unicode/brkiter.h"
     28 #include "unicode/usearch.h"
     29 
     30 /**
     31 * @stable ICU 2.0
     32 */
     33 struct USearch;
     34 /**
     35 * @stable ICU 2.0
     36 */
     37 typedef struct USearch USearch;
     38 
     39 U_NAMESPACE_BEGIN
     40 
     41 /**
     42  *
     43  * <tt>SearchIterator</tt> is an abstract base class that provides
     44  * methods to search for a pattern within a text string. Instances of
     45  * <tt>SearchIterator</tt> maintain a current position and scan over the
     46  * target text, returning the indices at which the pattern is matched and
     47  * the length of each match.
     48  * <p>
     49  * <tt>SearchIterator</tt> defines a protocol for text searching.
     50  * Subclasses provide concrete implementations of various search algorithms.
     51  * For example, <tt>StringSearch</tt> implements language-sensitive pattern
     52  * matching based on the comparison rules defined in a
     53  * <tt>RuleBasedCollator</tt> object.
     54  * <p>
     55  * Other options for searching include using a BreakIterator to restrict
     56  * the points at which matches are detected.
     57  * <p>
     58  * <tt>SearchIterator</tt> provides an API that is similar to that of
     59  * other text iteration classes such as <tt>BreakIterator</tt>. Using
     60  * this class, it is easy to scan through text looking for all occurrences of
     61  * a given pattern. The following example uses a <tt>StringSearch</tt>
     62  * object to find all instances of "fox" in the target string. Any other
     63  * subclass of <tt>SearchIterator</tt> can be used in an identical
     64  * manner.
     65  * <pre><code>
     66  * UnicodeString target("The quick brown fox jumped over the lazy fox");
     67  * UnicodeString pattern("fox");
     68  *
     69  * SearchIterator *iter  = new StringSearch(pattern, target);
     70  * UErrorCode      error = U_ZERO_ERROR;
     71  * for (int pos = iter->first(error); pos != USEARCH_DONE;
     72  *                               pos = iter->next(error)) {
     73  *     printf("Found match at %d pos, length is %d\n", pos,
     74  *                                             iter->getMatchedLength());
     75  * }
     76  * </code></pre>
     77  *
     78  * @see StringSearch
     79  * @see RuleBasedCollator
     80  */
     81 class U_I18N_API SearchIterator : public UObject {
     82 
     83 public:
     84 
     85     // public constructors and destructors -------------------------------
     86 
     87     /**
     88     * Copy constructor that creates a SearchIterator instance with the same
     89     * behavior, and iterating over the same text.
     90     * @param other the SearchIterator instance to be copied.
     91     * @stable ICU 2.0
     92     */
     93     SearchIterator(const SearchIterator &other);
     94 
     95     /**
     96      * Destructor. Cleans up the search iterator data struct.
     97      * @stable ICU 2.0
     98      */
     99     virtual ~SearchIterator();
    100 
    101     // public get and set methods ----------------------------------------
    102 
    103     /**
    104      * Sets the index to point to the given position, and clears any state
    105      * that's affected.
    106      * <p>
    107      * This method takes the argument index and sets the position in the text
    108      * string accordingly without checking if the index is pointing to a
    109      * valid starting point to begin searching.
    110      * @param position within the text to be set. If position is less
    111      *             than or greater than the text range for searching,
    112      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    113      * @param status for errors if it occurs
    114      * @stable ICU 2.0
    115      */
    116     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
    117 
    118     /**
    119      * Return the current index in the text being searched.
    120      * If the iteration has gone past the end of the text
    121      * (or past the beginning for a backwards search), USEARCH_DONE
    122      * is returned.
    123      * @return current index in the text being searched.
    124      * @stable ICU 2.0
    125      */
    126     virtual int32_t getOffset(void) const = 0;
    127 
    128     /**
    129     * Sets the text searching attributes located in the enum
    130     * USearchAttribute with values from the enum USearchAttributeValue.
    131     * USEARCH_DEFAULT can be used for all attributes for resetting.
    132     * @param attribute text attribute (enum USearchAttribute) to be set
    133     * @param value text attribute value
    134     * @param status for errors if it occurs
    135     * @stable ICU 2.0
    136     */
    137     void setAttribute(USearchAttribute       attribute,
    138                       USearchAttributeValue  value,
    139                       UErrorCode            &status);
    140 
    141     /**
    142     * Gets the text searching attributes
    143     * @param attribute text attribute (enum USearchAttribute) to be retrieved
    144     * @return text attribute value
    145     * @stable ICU 2.0
    146     */
    147     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
    148 
    149     /**
    150     * Returns the index to the match in the text string that was searched.
    151     * This call returns a valid result only after a successful call to
    152     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    153     * Just after construction, or after a searching method returns
    154     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
    155     * <p>
    156     * Use getMatchedLength to get the matched string length.
    157     * @return index of a substring within the text string that is being
    158     *         searched.
    159     * @see #first
    160     * @see #next
    161     * @see #previous
    162     * @see #last
    163     * @stable ICU 2.0
    164     */
    165     int32_t getMatchedStart(void) const;
    166 
    167     /**
    168      * Returns the length of text in the string which matches the search
    169      * pattern. This call returns a valid result only after a successful call
    170      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    171      * Just after construction, or after a searching method returns
    172      * <tt>USEARCH_DONE</tt>, this method will return 0.
    173      * @return The length of the match in the target text, or 0 if there
    174      *         is no match currently.
    175      * @see #first
    176      * @see #next
    177      * @see #previous
    178      * @see #last
    179      * @stable ICU 2.0
    180      */
    181     int32_t getMatchedLength(void) const;
    182 
    183     /**
    184      * Returns the text that was matched by the most recent call to
    185      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    186      * If the iterator is not pointing at a valid match (e.g. just after
    187      * construction or after <tt>USEARCH_DONE</tt> has been returned),
    188      * returns an empty string.
    189      * @param result stores the matched string or an empty string if a match
    190      *        is not found.
    191      * @see #first
    192      * @see #next
    193      * @see #previous
    194      * @see #last
    195      * @stable ICU 2.0
    196      */
    197     void getMatchedText(UnicodeString &result) const;
    198 
    199     /**
    200      * Set the BreakIterator that will be used to restrict the points
    201      * at which matches are detected. The user is responsible for deleting
    202      * the breakiterator.
    203      * @param breakiter A BreakIterator that will be used to restrict the
    204      *                points at which matches are detected. If a match is
    205      *                found, but the match's start or end index is not a
    206      *                boundary as determined by the <tt>BreakIterator</tt>,
    207      *                the match will be rejected and another will be searched
    208      *                for. If this parameter is <tt>NULL</tt>, no break
    209      *                detection is attempted.
    210      * @param status for errors if it occurs
    211      * @see BreakIterator
    212      * @stable ICU 2.0
    213      */
    214     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
    215 
    216     /**
    217      * Returns the BreakIterator that is used to restrict the points at
    218      * which matches are detected.  This will be the same object that was
    219      * passed to the constructor or to <tt>setBreakIterator</tt>.
    220      * Note that <tt>NULL</tt> is a legal value; it means that break
    221      * detection should not be attempted.
    222      * @return BreakIterator used to restrict matchings.
    223      * @see #setBreakIterator
    224      * @stable ICU 2.0
    225      */
    226     const BreakIterator * getBreakIterator(void) const;
    227 
    228     /**
    229      * Set the string text to be searched. Text iteration will hence begin at
    230      * the start of the text string. This method is useful if you want to
    231      * re-use an iterator to search for the same pattern within a different
    232      * body of text. The user is responsible for deleting the text.
    233      * @param text string to be searched.
    234      * @param status for errors. If the text length is 0,
    235      *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
    236      * @stable ICU 2.0
    237      */
    238     virtual void setText(const UnicodeString &text, UErrorCode &status);
    239 
    240     /**
    241      * Set the string text to be searched. Text iteration will hence begin at
    242      * the start of the text string. This method is useful if you want to
    243      * re-use an iterator to search for the same pattern within a different
    244      * body of text.
    245      * <p>
    246      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
    247      * will be done during searching for this version. The block of text
    248      * in <tt>CharacterIterator</tt> will be used as it is.
    249      * The user is responsible for deleting the text.
    250      * @param text string iterator to be searched.
    251      * @param status for errors if any. If the text length is 0 then an
    252      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
    253      * @stable ICU 2.0
    254      */
    255     virtual void setText(CharacterIterator &text, UErrorCode &status);
    256 
    257     /**
    258      * Return the string text to be searched.
    259      * @return text string to be searched.
    260      * @stable ICU 2.0
    261      */
    262     const UnicodeString & getText(void) const;
    263 
    264     // operator overloading ----------------------------------------------
    265 
    266     /**
    267      * Equality operator.
    268      * @param that SearchIterator instance to be compared.
    269      * @return TRUE if both SearchIterators are of the same class, have the
    270      *         same behavior, iterate over the same text and have the same
    271      *         attributes. FALSE otherwise.
    272      * @stable ICU 2.0
    273      */
    274     virtual UBool operator==(const SearchIterator &that) const;
    275 
    276     /**
    277      * Not-equal operator.
    278      * @param that SearchIterator instance to be compared.
    279      * @return FALSE if operator== returns TRUE, and vice versa.
    280      * @stable ICU 2.0
    281      */
    282     UBool operator!=(const SearchIterator &that) const;
    283 
    284     // public methods ----------------------------------------------------
    285 
    286     /**
    287      * Returns a copy of SearchIterator with the same behavior, and
    288      * iterating over the same text, as this one. Note that all data will be
    289      * replicated, except for the text string to be searched.
    290      * @return cloned object
    291      * @stable ICU 2.0
    292      */
    293     virtual SearchIterator* safeClone(void) const = 0;
    294 
    295     /**
    296      * Returns the first index at which the string text matches the search
    297      * pattern. The iterator is adjusted so that its current index (as
    298      * returned by <tt>getOffset</tt>) is the match position if one
    299      * was found.
    300      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    301      * the iterator will be adjusted to the index USEARCH_DONE.
    302      * @param  status for errors if it occurs
    303      * @return The character index of the first match, or
    304      *         <tt>USEARCH_DONE</tt> if there are no matches.
    305      * @see #getOffset
    306      * @stable ICU 2.0
    307      */
    308     int32_t first(UErrorCode &status);
    309 
    310     /**
    311      * Returns the first index equal to or greater than <tt>position</tt> at which the
    312      * string text matches the search pattern. The iterator is adjusted so
    313      * that its current index (as returned by <tt>getOffset</tt>) is the
    314      * match position if one was found.
    315      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
    316      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
    317      * @param  position where search is to start from. If position is less
    318      *             than or greater than the text range for searching,
    319      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    320      * @param  status for errors if it occurs
    321      * @return The character index of the first match following
    322      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
    323      *         matches.
    324      * @see #getOffset
    325      * @stable ICU 2.0
    326      */
    327     int32_t following(int32_t position, UErrorCode &status);
    328 
    329     /**
    330      * Returns the last index in the target text at which it matches the
    331      * search pattern. The iterator is adjusted so that its current index
    332      * (as returned by <tt>getOffset</tt>) is the match position if one was
    333      * found.
    334      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    335      * the iterator will be adjusted to the index USEARCH_DONE.
    336      * @param  status for errors if it occurs
    337      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
    338      *         there are no matches.
    339      * @see #getOffset
    340      * @stable ICU 2.0
    341      */
    342     int32_t last(UErrorCode &status);
    343 
    344     /**
    345      * Returns the first index less than <tt>position</tt> at which the string
    346      * text matches the search pattern. The iterator is adjusted so that its
    347      * current index (as returned by <tt>getOffset</tt>) is the match
    348      * position if one was found. If a match is not found,
    349      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
    350      * adjusted to the index USEARCH_DONE.
    351      * <p>
    352      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
    353      * result match is always less than <tt>position</tt>.
    354      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
    355      * <tt>position</tt>.
    356      *
    357      * @param  position where search is to start from. If position is less
    358      *             than or greater than the text range for searching,
    359      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    360      * @param  status for errors if it occurs
    361      * @return The character index of the first match preceding
    362      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
    363      *         no matches.
    364      * @see #getOffset
    365      * @stable ICU 2.0
    366      */
    367     int32_t preceding(int32_t position, UErrorCode &status);
    368 
    369     /**
    370      * Returns the index of the next point at which the text matches the
    371      * search pattern, starting from the current position.
    372      * The iterator is adjusted so that its current index (as returned by
    373      * <tt>getOffset</tt>) is the match position if one was found.
    374      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    375      * the iterator will be adjusted to a position after the end of the text
    376      * string.
    377      * @param  status for errors if it occurs
    378      * @return The index of the next match after the current position,
    379      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    380      * @see #getOffset
    381      * @stable ICU 2.0
    382      */
    383      int32_t next(UErrorCode &status);
    384 
    385     /**
    386      * Returns the index of the previous point at which the string text
    387      * matches the search pattern, starting at the current position.
    388      * The iterator is adjusted so that its current index (as returned by
    389      * <tt>getOffset</tt>) is the match position if one was found.
    390      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    391      * the iterator will be adjusted to the index USEARCH_DONE.
    392      * @param  status for errors if it occurs
    393      * @return The index of the previous match before the current position,
    394      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    395      * @see #getOffset
    396      * @stable ICU 2.0
    397      */
    398     int32_t previous(UErrorCode &status);
    399 
    400     /**
    401     * Resets the iteration.
    402     * Search will begin at the start of the text string if a forward
    403     * iteration is initiated before a backwards iteration. Otherwise if a
    404     * backwards iteration is initiated before a forwards iteration, the
    405     * search will begin at the end of the text string.
    406     * @stable ICU 2.0
    407     */
    408     virtual void reset();
    409 
    410 protected:
    411     // protected data members ---------------------------------------------
    412 
    413     /**
    414     * C search data struct
    415     * @stable ICU 2.0
    416     */
    417     USearch *m_search_;
    418 
    419     /**
    420     * Break iterator.
    421     * Currently the C++ breakiterator does not have getRules etc to reproduce
    422     * another in C. Hence we keep the original around and do the verification
    423     * at the end of the match. The user is responsible for deleting this
    424     * break iterator.
    425     * @stable ICU 2.0
    426     */
    427     BreakIterator *m_breakiterator_;
    428 
    429     /**
    430     * Unicode string version of the search text
    431     * @stable ICU 2.0
    432     */
    433     UnicodeString  m_text_;
    434 
    435     // protected constructors and destructors -----------------------------
    436 
    437     /**
    438     * Default constructor.
    439     * Initializes data to the default values.
    440     * @stable ICU 2.0
    441     */
    442     SearchIterator();
    443 
    444     /**
    445      * Constructor for use by subclasses.
    446      * @param text The target text to be searched.
    447      * @param breakiter A {@link BreakIterator} that is used to restrict the
    448      *                points at which matches are detected. If
    449      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
    450      *                match, but the match's start or end index is not a
    451      *                boundary as determined by the <tt>BreakIterator</tt>,
    452      *                the match is rejected and <tt>handleNext</tt> or
    453      *                <tt>handlePrev</tt> is called again. If this parameter
    454      *                is <tt>NULL</tt>, no break detection is attempted.
    455      * @see #handleNext
    456      * @see #handlePrev
    457      * @stable ICU 2.0
    458      */
    459     SearchIterator(const UnicodeString &text,
    460                          BreakIterator *breakiter = NULL);
    461 
    462     /**
    463      * Constructor for use by subclasses.
    464      * <p>
    465      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
    466      * will be done during searching for this version. The block of text
    467      * in <tt>CharacterIterator</tt> will be used as it is.
    468      * @param text The target text to be searched.
    469      * @param breakiter A {@link BreakIterator} that is used to restrict the
    470      *                points at which matches are detected. If
    471      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
    472      *                match, but the match's start or end index is not a
    473      *                boundary as determined by the <tt>BreakIterator</tt>,
    474      *                the match is rejected and <tt>handleNext</tt> or
    475      *                <tt>handlePrev</tt> is called again. If this parameter
    476      *                is <tt>NULL</tt>, no break detection is attempted.
    477      * @see #handleNext
    478      * @see #handlePrev
    479      * @stable ICU 2.0
    480      */
    481     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
    482 
    483     // protected methods --------------------------------------------------
    484 
    485     /**
    486      * Assignment operator. Sets this iterator to have the same behavior,
    487      * and iterate over the same text, as the one passed in.
    488      * @param that instance to be copied.
    489      * @stable ICU 2.0
    490      */
    491     SearchIterator & operator=(const SearchIterator &that);
    492 
    493     /**
    494      * Abstract method which subclasses override to provide the mechanism
    495      * for finding the next match in the target text. This allows different
    496      * subclasses to provide different search algorithms.
    497      * <p>
    498      * If a match is found, the implementation should return the index at
    499      * which the match starts and should call
    500      * <tt>setMatchLength</tt> with the number of characters
    501      * in the target text that make up the match. If no match is found, the
    502      * method should return USEARCH_DONE.
    503      * <p>
    504      * @param position The index in the target text at which the search
    505      *                 should start.
    506      * @param status for error codes if it occurs.
    507      * @return index at which the match starts, else if match is not found
    508      *         USEARCH_DONE is returned
    509      * @see #setMatchLength
    510      * @stable ICU 2.0
    511      */
    512     virtual int32_t handleNext(int32_t position, UErrorCode &status)
    513                                                                          = 0;
    514 
    515     /**
    516      * Abstract method which subclasses override to provide the mechanism for
    517      * finding the previous match in the target text. This allows different
    518      * subclasses to provide different search algorithms.
    519      * <p>
    520      * If a match is found, the implementation should return the index at
    521      * which the match starts and should call
    522      * <tt>setMatchLength</tt> with the number of characters
    523      * in the target text that make up the match. If no match is found, the
    524      * method should return USEARCH_DONE.
    525      * <p>
    526      * @param position The index in the target text at which the search
    527      *                 should start.
    528      * @param status for error codes if it occurs.
    529      * @return index at which the match starts, else if match is not found
    530      *         USEARCH_DONE is returned
    531      * @see #setMatchLength
    532      * @stable ICU 2.0
    533      */
    534      virtual int32_t handlePrev(int32_t position, UErrorCode &status)
    535                                                                          = 0;
    536 
    537     /**
    538      * Sets the length of the currently matched string in the text string to
    539      * be searched.
    540      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    541      * methods should call this when they find a match in the target text.
    542      * @param length length of the matched text.
    543      * @see #handleNext
    544      * @see #handlePrev
    545      * @stable ICU 2.0
    546      */
    547     virtual void setMatchLength(int32_t length);
    548 
    549     /**
    550      * Sets the offset of the currently matched string in the text string to
    551      * be searched.
    552      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    553      * methods should call this when they find a match in the target text.
    554      * @param position start offset of the matched text.
    555      * @see #handleNext
    556      * @see #handlePrev
    557      * @stable ICU 2.0
    558      */
    559     virtual void setMatchStart(int32_t position);
    560 
    561     /**
    562     * Sets match not found.
    563     * @stable ICU 2.0
    564     */
    565     void setMatchNotFound();
    566 };
    567 
    568 inline UBool SearchIterator::operator!=(const SearchIterator &that) const
    569 {   // Inequality is the logical negation of the virtual operator==.
    570    return !(this->operator==(that));
    571 }
    572 U_NAMESPACE_END
    573 
    574 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
    575 
    576 #endif
    577 
    578