Home | History | Annotate | Download | only in unicode
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 * Copyright (c) 2002-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 #ifndef USETITER_H
     10 #define USETITER_H
     11 
     12 #include "unicode/utypes.h"
     13 #include "unicode/uobject.h"
     14 #include "unicode/unistr.h"
     15 
     16 /**
     17  * \file
     18  * \brief C++ API: UnicodeSetIterator iterates over the contents of a UnicodeSet.
     19  */
     20 
     21 U_NAMESPACE_BEGIN
     22 
     23 class UnicodeSet;
     24 class UnicodeString;
     25 
     26 /**
     27  *
     28  * UnicodeSetIterator iterates over the contents of a UnicodeSet.  It
     29  * iterates over either code points or code point ranges.  After all
     30  * code points or ranges have been returned, it returns the
     31  * multicharacter strings of the UnicodeSet, if any.
     32  *
     33  * This class is not intended to be subclassed.  Consider any fields
     34  *  or methods declared as "protected" to be private.  The use of
     35  *  protected in this class is an artifact of history.
     36  *
     37  * <p>To iterate over code points and strings, use a loop like this:
     38  * <pre>
     39  * UnicodeSetIterator it(set);
     40  * while (it.next()) {
     41  *     processItem(it.getString());
     42  * }
     43  * </pre>
     44  * <p>Each item in the set is accessed as a string.  Set elements
     45  *    consisting of single code points are returned as strings containing
     46  *    just the one code point.
     47  *
     48  * <p>To iterate over code point ranges, instead of individual code points,
     49  *    use a loop like this:
     50  * <pre>
     51  * UnicodeSetIterator it(set);
     52  * while (it.nextRange()) {
     53  *   if (it.isString()) {
     54  *     processString(it.getString());
     55  *   } else {
     56  *     processCodepointRange(it.getCodepoint(), it.getCodepointEnd());
     57  *   }
     58  * }
     59  * </pre>
     60  * @author M. Davis
     61  * @stable ICU 2.4
     62  */
     63 class U_COMMON_API UnicodeSetIterator : public UObject {
     64 
     65  protected:
     66 
     67     /**
     68      * Value of <tt>codepoint</tt> if the iterator points to a string.
     69      * If <tt>codepoint == IS_STRING</tt>, then examine
     70      * <tt>string</tt> for the current iteration result.
     71      * @stable ICU 2.4
     72      */
     73     enum { IS_STRING = -1 };
     74 
     75     /**
     76      * Current code point, or the special value <tt>IS_STRING</tt>, if
     77      * the iterator points to a string.
     78      * @stable ICU 2.4
     79      */
     80     UChar32 codepoint;
     81 
     82     /**
     83      * When iterating over ranges using <tt>nextRange()</tt>,
     84      * <tt>codepointEnd</tt> contains the inclusive end of the
     85      * iteration range, if <tt>codepoint != IS_STRING</tt>.  If
     86      * iterating over code points using <tt>next()</tt>, or if
     87      * <tt>codepoint == IS_STRING</tt>, then the value of
     88      * <tt>codepointEnd</tt> is undefined.
     89      * @stable ICU 2.4
     90      */
     91     UChar32 codepointEnd;
     92 
     93     /**
     94      * If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
     95      * to the current string.  If <tt>codepoint != IS_STRING</tt>, the
     96      * value of <tt>string</tt> is undefined.
     97      * @stable ICU 2.4
     98      */
     99     const UnicodeString* string;
    100 
    101  public:
    102 
    103     /**
    104      * Create an iterator over the given set.  The iterator is valid
    105      * only so long as <tt>set</tt> is valid.
    106      * @param set set to iterate over
    107      * @stable ICU 2.4
    108      */
    109     UnicodeSetIterator(const UnicodeSet& set);
    110 
    111     /**
    112      * Create an iterator over nothing.  <tt>next()</tt> and
    113      * <tt>nextRange()</tt> return false. This is a convenience
    114      * constructor allowing the target to be set later.
    115      * @stable ICU 2.4
    116      */
    117     UnicodeSetIterator();
    118 
    119     /**
    120      * Destructor.
    121      * @stable ICU 2.4
    122      */
    123     virtual ~UnicodeSetIterator();
    124 
    125     /**
    126      * Returns true if the current element is a string.  If so, the
    127      * caller can retrieve it with <tt>getString()</tt>.  If this
    128      * method returns false, the current element is a code point or
    129      * code point range, depending on whether <tt>next()</tt> or
    130      * <tt>nextRange()</tt> was called.
    131      * Elements of types string and codepoint can both be retrieved
    132      * with the function <tt>getString()</tt>.
    133      * Elements of type codepoint can also be retrieved with
    134      * <tt>getCodepoint()</tt>.
    135      * For ranges, <tt>getCodepoint()</tt> returns the starting codepoint
    136      * of the range, and <tt>getCodepointEnd()</tt> returns the end
    137      * of the range.
    138      * @stable ICU 2.4
    139      */
    140     inline UBool isString() const;
    141 
    142     /**
    143      * Returns the current code point, if <tt>isString()</tt> returned
    144      * false.  Otherwise returns an undefined result.
    145      * @stable ICU 2.4
    146      */
    147     inline UChar32 getCodepoint() const;
    148 
    149     /**
    150      * Returns the end of the current code point range, if
    151      * <tt>isString()</tt> returned false and <tt>nextRange()</tt> was
    152      * called.  Otherwise returns an undefined result.
    153      * @stable ICU 2.4
    154      */
    155     inline UChar32 getCodepointEnd() const;
    156 
    157     /**
    158      * Returns the current string, if <tt>isString()</tt> returned
    159      * true.  If the current iteration item is a code point, a UnicodeString
    160      * containing that single code point is returned.
    161      *
    162      * Ownership of the returned string remains with the iterator.
    163      * The string is guaranteed to remain valid only until the iterator is
    164      *   advanced to the next item, or until the iterator is deleted.
    165      *
    166      * @stable ICU 2.4
    167      */
    168     const UnicodeString& getString();
    169 
    170     /**
    171      * Advances the iteration position to the next element in the set,
    172      * which can be either a single code point or a string.
    173      * If there are no more elements in the set, return false.
    174      *
    175      * <p>
    176      * If <tt>isString() == TRUE</tt>, the value is a
    177      * string, otherwise the value is a
    178      * single code point.  Elements of either type can be retrieved
    179      * with the function <tt>getString()</tt>, while elements
    180      * consisting of a single code point can also be retrieved with
    181      * <tt>getCodepoint()</tt>.
    182      *
    183      * <p>The order of iteration is all code points in sorted order,
    184      * followed by all strings in sorted order.  Do not mix
    185      * calls to <tt>next()</tt> and <tt>nextRange()</tt> without
    186      * calling <tt>reset()</tt> between them.  The results of doing so
    187      * are undefined.
    188      *
    189      * @return true if there was another element in the set.
    190      * @stable ICU 2.4
    191      */
    192     UBool next();
    193 
    194     /**
    195      * Returns the next element in the set, either a code point range
    196      * or a string.  If there are no more elements in the set, return
    197      * false.  If <tt>isString() == TRUE</tt>, the value is a
    198      * string and can be accessed with <tt>getString()</tt>.  Otherwise the value is a
    199      * range of one or more code points from <tt>getCodepoint()</tt> to
    200      * <tt>getCodepointEnd()</tt> inclusive.
    201      *
    202      * <p>The order of iteration is all code point ranges in sorted
    203      * order, followed by all strings in sorted order.  Ranges are
    204      * disjoint and non-contiguous.  The value returned from <tt>getString()</tt>
    205      * is undefined unless <tt>isString() == TRUE</tt>.  Do not mix calls to
    206      * <tt>next()</tt> and <tt>nextRange()</tt> without calling
    207      * <tt>reset()</tt> between them.  The results of doing so are
    208      * undefined.
    209      *
    210      * @return true if there was another element in the set.
    211      * @stable ICU 2.4
    212      */
    213     UBool nextRange();
    214 
    215     /**
    216      * Sets this iterator to visit the elements of the given set and
    217      * resets it to the start of that set.  The iterator is valid only
    218      * so long as <tt>set</tt> is valid.
    219      * @param set the set to iterate over.
    220      * @stable ICU 2.4
    221      */
    222     void reset(const UnicodeSet& set);
    223 
    224     /**
    225      * Resets this iterator to the start of the set.
    226      * @stable ICU 2.4
    227      */
    228     void reset();
    229 
    230     /**
    231      * ICU "poor man's RTTI", returns a UClassID for this class.
    232      *
    233      * @stable ICU 2.4
    234      */
    235     static UClassID U_EXPORT2 getStaticClassID();
    236 
    237     /**
    238      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    239      *
    240      * @stable ICU 2.4
    241      */
    242     virtual UClassID getDynamicClassID() const;
    243 
    244     // ======================= PRIVATES ===========================
    245     // These members are protected only for historical reasons; treat them as private.
    246  protected:
    247 
    248     // endElement and nextElement are really UChar32's, but we keep
    249     // them as signed int32_t's so we can do comparisons with
    250     // endElement set to -1.  Leave them as int32_t's.
    251     /** The set
    252      * @stable ICU 2.4
    253      */
    254     const UnicodeSet* set;
    255     /** End range
    256      * @stable ICU 2.4
    257      */
    258     int32_t endRange;
    259     /** Range
    260      * @stable ICU 2.4
    261      */
    262     int32_t range;
    263     /** End element
    264      * @stable ICU 2.4
    265      */
    266     int32_t endElement;
    267     /** Next element
    268      * @stable ICU 2.4
    269      */
    270     int32_t nextElement;
    271     //UBool abbreviated;
    272     /** Next string
    273      * @stable ICU 2.4
    274      */
    275     int32_t nextString;
    276     /** String count
    277      * @stable ICU 2.4
    278      */
    279     int32_t stringCount;
    280 
    281     /**
    282      *  Points to the string to use when the caller asks for a
    283      *  string and the current iteration item is a code point, not a string.
    284      *  @internal
    285      */
    286     UnicodeString *cpString;
    287 
    288     /** Copy constructor. Disallowed.
    289      * @stable ICU 2.4
    290      */
    291     UnicodeSetIterator(const UnicodeSetIterator&); // disallow
    292 
    293     /** Assignment operator. Disallowed.
    294      * @stable ICU 2.4
    295      */
    296     UnicodeSetIterator& operator=(const UnicodeSetIterator&); // disallow
    297 
    298     /** Load range.  NOTE(review): presumably loads the set's range number <tt>range</tt> into the iteration state; confirm against the implementation.
    299      * @stable ICU 2.4
    300      */
    301     virtual void loadRange(int32_t range);
    302 
    303 };
    304 
    305 inline UBool UnicodeSetIterator::isString() const {
    306     return codepoint == (UChar32)IS_STRING;
    307 }
    308 
    309 inline UChar32 UnicodeSetIterator::getCodepoint() const {
    310     return codepoint;
    311 }
    312 
    313 inline UChar32 UnicodeSetIterator::getCodepointEnd() const {
    314     return codepointEnd;
    315 }
    316 
    317 
    318 U_NAMESPACE_END
    319 
    320 #endif
    321