Home | History | Annotate | Download | only in unicode
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  03/22/2000   helena      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #ifndef SEARCH_H
     11 #define SEARCH_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 /**
     16  * \file
     17  * \brief C++ API: SearchIterator object.
     18  */
     19 
     20 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     21 
     22 #include "unicode/uobject.h"
     23 #include "unicode/unistr.h"
     24 #include "unicode/chariter.h"
     25 #include "unicode/brkiter.h"
     26 #include "unicode/usearch.h"
     27 
     28 /** Forward declaration of the C search data struct that SearchIterator::m_search_ points to.
     29 * @stable ICU 2.0
     30 */
     31 struct USearch;
     32 /** Typedef so that the struct can be referred to as USearch without the struct keyword.
     33 * @stable ICU 2.0
     34 */
     35 typedef struct USearch USearch;
     36 
     37 U_NAMESPACE_BEGIN
     38 
     39 /**
     40  *
     41  * <tt>SearchIterator</tt> is an abstract base class that provides
     42  * methods to search for a pattern within a text string. Instances of
     43  * <tt>SearchIterator</tt> maintain a current position and scan over the
     44  * target text, returning the indices at which the pattern is matched and
     45  * the length of each match.
     46  * <p>
     47  * <tt>SearchIterator</tt> defines a protocol for text searching.
     48  * Subclasses provide concrete implementations of various search algorithms.
     49  * For example, <tt>StringSearch</tt> implements language-sensitive pattern
     50  * matching based on the comparison rules defined in a
     51  * <tt>RuleBasedCollator</tt> object.
     52  * <p>
     53  * Other options for searching include using a BreakIterator to restrict
     54  * the points at which matches are detected.
     55  * <p>
     56  * <tt>SearchIterator</tt> provides an API that is similar to that of
     57  * other text iteration classes such as <tt>BreakIterator</tt>. Using
     58  * this class, it is easy to scan through text looking for all occurrences of
     59  * a given pattern. The following example uses a <tt>StringSearch</tt>
     60  * object to find all instances of "fox" in the target string. Any other
     61  * subclass of <tt>SearchIterator</tt> can be used in an identical
     62  * manner.
     63  * <pre><code>
     64  * UnicodeString target("The quick brown fox jumped over the lazy fox");
     65  * UnicodeString pattern("fox");
     66  *
     67  * SearchIterator *iter  = new StringSearch(pattern, target);
     68  * UErrorCode      error = U_ZERO_ERROR;
     69  * for (int pos = iter->first(error); pos != USEARCH_DONE;
     70  *                               pos = iter->next(error)) {
     71  *     printf("Found match at %d pos, length is %d\n", pos,
     72  *                                             iter->getMatchedLength());
     73  * }
     74  * </code></pre>
     75  *
     76  * @see StringSearch
     77  * @see RuleBasedCollator
     78  */
     79 class U_I18N_API SearchIterator : public UObject {
     80 
     81 public:
     82 
     83     // public constructors and destructors -------------------------------
     84 
     85     /**
     86     * Copy constructor that creates a SearchIterator instance with the same
     87     * behavior, and iterating over the same text.
     88     * @param other the SearchIterator instance to be copied.
     89     * @stable ICU 2.0
     90     */
     91     SearchIterator(const SearchIterator &other);
     92 
     93     /**
     94      * Destructor. Cleans up the search iterator data struct.
     95      * @stable ICU 2.0
     96      */
     97     virtual ~SearchIterator();
     98 
     99     // public get and set methods ----------------------------------------
    100 
    101     /**
    102      * Sets the index to point to the given position, and clears any state
    103      * that's affected.
    104      * <p>
    105      * This method takes the argument index and sets the position in the text
    106      * string accordingly without checking if the index is pointing to a
    107      * valid starting point to begin searching.
    108      * @param position within the text to be set. If position lies outside
    109      *             the text range for searching, a
    110      *          U_INDEX_OUTOFBOUNDS_ERROR is reported through status.
    111      * @param status for errors if it occurs
    112      * @stable ICU 2.0
    113      */
    114     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
    115 
    116     /**
    117      * Return the current index in the text being searched.
    118      * If the iteration has gone past the end of the text
    119      * (or past the beginning for a backwards search), USEARCH_DONE
    120      * is returned.
    121      * @return current index in the text being searched.
    122      * @stable ICU 2.0
    123      */
    124     virtual int32_t getOffset(void) const = 0;
    125 
    126     /**
    127     * Sets the text searching attributes located in the enum
    128     * USearchAttribute with values from the enum USearchAttributeValue.
    129     * USEARCH_DEFAULT can be used for all attributes for resetting.
    130     * @param attribute text attribute (enum USearchAttribute) to be set
    131     * @param value text attribute value
    132     * @param status for errors if it occurs
    133     * @stable ICU 2.0
    134     */
    135     void setAttribute(USearchAttribute       attribute,
    136                       USearchAttributeValue  value,
    137                       UErrorCode            &status);
    138 
    139     /**
    140     * Gets the text searching attributes
    141     * @param attribute text attribute (enum USearchAttribute) to be retrieved
    142     * @return text attribute value
    143     * @stable ICU 2.0
    144     */
    145     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
    146 
    147     /**
    148     * Returns the index to the match in the text string that was searched.
    149     * This call returns a valid result only after a successful call to
    150     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    151     * Just after construction, or after a searching method returns
    152     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
    153     * <p>
    154     * Use getMatchedLength to get the matched string length.
    155     * @return index of a substring within the text string that is being
    156     *         searched.
    157     * @see #first
    158     * @see #next
    159     * @see #previous
    160     * @see #last
    161     * @stable ICU 2.0
    162     */
    163     int32_t getMatchedStart(void) const;
    164 
    165     /**
    166      * Returns the length of text in the string which matches the search
    167      * pattern. This call returns a valid result only after a successful call
    168      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    169      * Just after construction, or after a searching method returns
    170      * <tt>USEARCH_DONE</tt>, this method will return 0.
    171      * @return The length of the match in the target text, or 0 if there
    172      *         is no match currently.
    173      * @see #first
    174      * @see #next
    175      * @see #previous
    176      * @see #last
    177      * @stable ICU 2.0
    178      */
    179     int32_t getMatchedLength(void) const;
    180 
    181     /**
    182      * Returns the text that was matched by the most recent call to
    183      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    184      * If the iterator is not pointing at a valid match (e.g. just after
    185      * construction or after <tt>USEARCH_DONE</tt> has been returned),
    186      * returns an empty string.
    187      * @param result stores the matched string or an empty string if a match
    188      *        is not found.
    189      * @see #first
    190      * @see #next
    191      * @see #previous
    192      * @see #last
    193      * @stable ICU 2.0
    194      */
    195     void getMatchedText(UnicodeString &result) const;
    196 
    197     /**
    198      * Set the BreakIterator that will be used to restrict the points
    199      * at which matches are detected. The user is responsible for deleting
    200      * the BreakIterator.
    201      * @param breakiter A BreakIterator that will be used to restrict the
    202      *                points at which matches are detected. If a match is
    203      *                found, but the match's start or end index is not a
    204      *                boundary as determined by the <tt>BreakIterator</tt>,
    205      *                the match will be rejected and another will be searched
    206      *                for. If this parameter is <tt>NULL</tt>, no break
    207      *                detection is attempted.
    208      * @param status for errors if it occurs
    209      * @see BreakIterator
    210      * @stable ICU 2.0
    211      */
    212     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
    213 
    214     /**
    215      * Returns the BreakIterator that is used to restrict the points at
    216      * which matches are detected.  This will be the same object that was
    217      * passed to the constructor or to <tt>setBreakIterator</tt>.
    218      * Note that <tt>NULL</tt> is a legal value; it means that break
    219      * detection should not be attempted.
    220      * @return BreakIterator used to restrict matchings.
    221      * @see #setBreakIterator
    222      * @stable ICU 2.0
    223      */
    224     const BreakIterator * getBreakIterator(void) const;
    225 
    226     /**
    227      * Set the string text to be searched. Text iteration will hence begin at
    228      * the start of the text string. This method is useful if you want to
    229      * re-use an iterator to search for the same pattern within a different
    230      * body of text. The user is responsible for deleting the text.
    231      * @param text string to be searched.
    232      * @param status for errors. If the text length is 0,
    233      *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
    234      * @stable ICU 2.0
    235      */
    236     virtual void setText(const UnicodeString &text, UErrorCode &status);
    237 
    238     /**
    239      * Set the string text to be searched. Text iteration will hence begin at
    240      * the start of the text string. This method is useful if you want to
    241      * re-use an iterator to search for the same pattern within a different
    242      * body of text.
    243      * <p>
    244      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
    245      * will be done during searching for this version. The block of text
    246      * in <tt>CharacterIterator</tt> will be used as it is.
    247      * The user is responsible for deleting the text.
    248      * @param text string iterator to be searched.
    249      * @param status for errors if any. If the text length is 0 then an
    250      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
    251      * @stable ICU 2.0
    252      */
    253     virtual void setText(CharacterIterator &text, UErrorCode &status);
    254 
    255     /**
    256      * Return the string text to be searched.
    257      * @return text string to be searched.
    258      * @stable ICU 2.0
    259      */
    260     const UnicodeString & getText(void) const;
    261 
    262     // operator overloading ----------------------------------------------
    263 
    264     /**
    265      * Equality operator.
    266      * @param that SearchIterator instance to be compared.
    267      * @return TRUE if both SearchIterators are of the same class, have the
    268      *         same behavior, iterate over the same text and have the same
    269      *         attributes. FALSE otherwise.
    270      * @stable ICU 2.0
    271      */
    272     virtual UBool operator==(const SearchIterator &that) const;
    273 
    274     /**
    275      * Not-equal operator.
    276      * @param that SearchIterator instance to be compared.
    277      * @return FALSE if operator== returns TRUE, and vice versa.
    278      * @stable ICU 2.0
    279      */
    280     UBool operator!=(const SearchIterator &that) const;
    281 
    282     // public methods ----------------------------------------------------
    283 
    284     /**
    285      * Returns a copy of SearchIterator with the same behavior, and
    286      * iterating over the same text, as this one. Note that all data will be
    287      * replicated, except for the text string to be searched.
    288      * @return cloned object
    289      * @stable ICU 2.0
    290      */
    291     virtual SearchIterator* safeClone(void) const = 0;
    292 
    293     /**
    294      * Returns the first index at which the string text matches the search
    295      * pattern. The iterator is adjusted so that its current index (as
    296      * returned by <tt>getOffset</tt>) is the match position if one
    297      * was found.
    298      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    299      * the iterator will be adjusted to the index USEARCH_DONE.
    300      * @param  status for errors if it occurs
    301      * @return The character index of the first match, or
    302      *         <tt>USEARCH_DONE</tt> if there are no matches.
    303      * @see #getOffset
    304      * @stable ICU 2.0
    305      */
    306     int32_t first(UErrorCode &status);
    307 
    308     /**
    309      * Returns the first index equal to or greater than <tt>position</tt> at which the
    310      * string text matches the search pattern. The iterator is adjusted so
    311      * that its current index (as returned by <tt>getOffset</tt>) is the
    312      * match position if one was found.
    313      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
    314      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
    315      * @param  position where search is to start from. If position lies
    316      *             outside the text range for searching,
    317      *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
    318      * @param  status for errors if it occurs
    319      * @return The character index of the first match following
    320      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
    321      *         matches.
    322      * @see #getOffset
    323      * @stable ICU 2.0
    324      */
    325     int32_t following(int32_t position, UErrorCode &status);
    326 
    327     /**
    328      * Returns the last index in the target text at which it matches the
    329      * search pattern. The iterator is adjusted so that its current index
    330      * (as returned by <tt>getOffset</tt>) is the match position if one was
    331      * found.
    332      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    333      * the iterator will be adjusted to the index USEARCH_DONE.
    334      * @param  status for errors if it occurs
    335      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
    336      *         there are no matches.
    337      * @see #getOffset
    338      * @stable ICU 2.0
    339      */
    340     int32_t last(UErrorCode &status);
    341 
    342     /**
    343      * Returns the first index less than <tt>position</tt> at which the string
    344      * text matches the search pattern. The iterator is adjusted so that its
    345      * current index (as returned by <tt>getOffset</tt>) is the match
    346      * position if one was found. If a match is not found,
    347      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
    348      * adjusted to the index USEARCH_DONE.
    349      * <p>
    350      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
    351      * result match is always less than <tt>position</tt>.
    352      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
    353      * <tt>position</tt>.
    354      *
    355      * @param  position where search is to start from. If position lies
    356      *             outside the text range for searching,
    357      *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
    358      * @param  status for errors if it occurs
    359      * @return The character index of the first match preceding
    360      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
    361      *         no matches.
    362      * @see #getOffset
    363      * @stable ICU 2.0
    364      */
    365     int32_t preceding(int32_t position, UErrorCode &status);
    366 
    367     /**
    368      * Returns the index of the next point at which the text matches the
    369      * search pattern, starting from the current position.
    370      * The iterator is adjusted so that its current index (as returned by
    371      * <tt>getOffset</tt>) is the match position if one was found.
    372      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    373      * the iterator will be adjusted to a position after the end of the text
    374      * string.
    375      * @param  status for errors if it occurs
    376      * @return The index of the next match after the current position,
    377      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    378      * @see #getOffset
    379      * @stable ICU 2.0
    380      */
    381      int32_t next(UErrorCode &status);
    382 
    383     /**
    384      * Returns the index of the previous point at which the string text
    385      * matches the search pattern, starting at the current position.
    386      * The iterator is adjusted so that its current index (as returned by
    387      * <tt>getOffset</tt>) is the match position if one was found.
    388      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    389      * the iterator will be adjusted to the index USEARCH_DONE.
    390      * @param  status for errors if it occurs
    391      * @return The index of the previous match before the current position,
    392      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    393      * @see #getOffset
    394      * @stable ICU 2.0
    395      */
    396     int32_t previous(UErrorCode &status);
    397 
    398     /**
    399     * Resets the iteration.
    400     * Search will begin at the start of the text string if a forward
    401     * iteration is initiated before a backwards iteration. Otherwise if a
    402     * backwards iteration is initiated before a forwards iteration, the
    403     * search will begin at the end of the text string.
    404     * @stable ICU 2.0
    405     */
    406     virtual void reset();
    407 
    408 protected:
    409     // protected data members ---------------------------------------------
    410 
    411     /**
    412     * C search data struct
    413     * @stable ICU 2.0
    414     */
    415     USearch *m_search_;
    416 
    417     /**
    418     * Break iterator.
    419     * Currently the C++ breakiterator does not have getRules etc to reproduce
    420     * another in C. Hence we keep the original around and do the verification
    421     * at the end of the match. The user is responsible for deleting this
    422     * break iterator.
    423     * @stable ICU 2.0
    424     */
    425     BreakIterator *m_breakiterator_;
    426 
    427     /**
    428     * Unicode string version of the search text
    429     * @stable ICU 2.0
    430     */
    431     UnicodeString  m_text_;
    432 
    433     // protected constructors and destructors -----------------------------
    434 
    435     /**
    436     * Default constructor.
    437     * Initializes data to the default values.
    438     * @stable ICU 2.0
    439     */
    440     SearchIterator();
    441 
    442     /**
    443      * Constructor for use by subclasses.
    444      * @param text The target text to be searched.
    445      * @param breakiter A {@link BreakIterator} that is used to restrict the
    446      *                points at which matches are detected. If
    447      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
    448      *                match, but the match's start or end index is not a
    449      *                boundary as determined by the <tt>BreakIterator</tt>,
    450      *                the match is rejected and <tt>handleNext</tt> or
    451      *                <tt>handlePrev</tt> is called again. If this parameter
    452      *                is <tt>NULL</tt>, no break detection is attempted.
    453      * @see #handleNext
    454      * @see #handlePrev
    455      * @stable ICU 2.0
    456      */
    457     SearchIterator(const UnicodeString &text,
    458                          BreakIterator *breakiter = NULL);
    459 
    460     /**
    461      * Constructor for use by subclasses.
    462      * <p>
    463      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
    464      * will be done during searching for this version. The block of text
    465      * in <tt>CharacterIterator</tt> will be used as it is.
    466      * @param text The target text to be searched.
    467      * @param breakiter A {@link BreakIterator} that is used to restrict the
    468      *                points at which matches are detected. If
    469      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
    470      *                match, but the match's start or end index is not a
    471      *                boundary as determined by the <tt>BreakIterator</tt>,
    472      *                the match is rejected and <tt>handleNext</tt> or
    473      *                <tt>handlePrev</tt> is called again. If this parameter
    474      *                is <tt>NULL</tt>, no break detection is attempted.
    475      * @see #handleNext
    476      * @see #handlePrev
    477      * @stable ICU 2.0
    478      */
    479     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
    480 
    481     // protected methods --------------------------------------------------
    482 
    483     /**
    484      * Assignment operator. Sets this iterator to have the same behavior,
    485      * and iterate over the same text, as the one passed in.
    486      * @param that instance to be copied.
    487      * @stable ICU 2.0
    488      */
    489     SearchIterator & operator=(const SearchIterator &that);
    490 
    491     /**
    492      * Abstract method which subclasses override to provide the mechanism
    493      * for finding the next match in the target text. This allows different
    494      * subclasses to provide different search algorithms.
    495      * <p>
    496      * If a match is found, the implementation should return the index at
    497      * which the match starts and should call
    498      * <tt>setMatchLength</tt> with the number of characters
    499      * in the target text that make up the match. If no match is found, the
    500      * method should return USEARCH_DONE.
    501      * <p>
    502      * @param position The index in the target text at which the search
    503      *                 should start.
    504      * @param status for error codes if it occurs.
    505      * @return index at which the match starts, else if match is not found
    506      *         USEARCH_DONE is returned
    507      * @see #setMatchLength
    508      * @stable ICU 2.0
    509      */
    510     virtual int32_t handleNext(int32_t position, UErrorCode &status)
    511                                                                          = 0;
    512 
    513     /**
    514      * Abstract method which subclasses override to provide the mechanism for
    515      * finding the previous match in the target text. This allows different
    516      * subclasses to provide different search algorithms.
    517      * <p>
    518      * If a match is found, the implementation should return the index at
    519      * which the match starts and should call
    520      * <tt>setMatchLength</tt> with the number of characters
    521      * in the target text that make up the match. If no match is found, the
    522      * method should return USEARCH_DONE.
    523      * <p>
    524      * @param position The index in the target text at which the search
    525      *                 should start.
    526      * @param status for error codes if it occurs.
    527      * @return index at which the match starts, else if match is not found
    528      *         USEARCH_DONE is returned
    529      * @see #setMatchLength
    530      * @stable ICU 2.0
    531      */
    532      virtual int32_t handlePrev(int32_t position, UErrorCode &status)
    533                                                                          = 0;
    534 
    535     /**
    536      * Sets the length of the currently matched string in the text string to
    537      * be searched.
    538      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    539      * methods should call this when they find a match in the target text.
    540      * @param length length of the matched text.
    541      * @see #handleNext
    542      * @see #handlePrev
    543      * @stable ICU 2.0
    544      */
    545     virtual void setMatchLength(int32_t length);
    546 
    547     /**
    548      * Sets the offset of the currently matched string in the text string to
    549      * be searched.
    550      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    551      * methods should call this when they find a match in the target text.
    552      * @param position start offset of the matched text.
    553      * @see #handleNext
    554      * @see #handlePrev
    555      * @stable ICU 2.0
    556      */
    557     virtual void setMatchStart(int32_t position);
    558 
    559     /**
    560     * Sets the iterator state to "match not found".
    561     * @stable ICU 2.0
    562     */
    563     void setMatchNotFound();
    564 };
    565 
    566 inline UBool SearchIterator::operator!=(const SearchIterator &that) const
    567 {   // Inequality is defined purely as the negation of the virtual operator==.
    568    return !(*this == that);
    569 }
    570 U_NAMESPACE_END
    571 
    572 #endif /* #if !UCONFIG_NO_COLLATION */
    573 
    574 #endif
    575 
    576