1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 #ifndef USETITER_H 10 #define USETITER_H 11 12 #include "unicode/utypes.h" 13 #include "unicode/uobject.h" 14 #include "unicode/unistr.h" 15 16 /** 17 * \file 18 * \brief C++ API: UnicodeSetIterator iterates over the contents of a UnicodeSet. 19 */ 20 21 U_NAMESPACE_BEGIN 22 23 class UnicodeSet; 24 class UnicodeString; 25 26 /** 27 * 28 * UnicodeSetIterator iterates over the contents of a UnicodeSet. It 29 * iterates over either code points or code point ranges. After all 30 * code points or ranges have been returned, it returns the 31 * multicharacter strings of the UnicodeSet, if any. 32 * 33 * This class is not intended to be subclassed. Consider any fields 34 * or methods declared as "protected" to be private. The use of 35 * protected in this class is an artifact of history. 36 * 37 * <p>To iterate over code points and strings, use a loop like this: 38 * <pre> 39 * UnicodeSetIterator it(set); 40 * while (it.next()) { 41 * processItem(it.getString()); 42 * } 43 * </pre> 44 * <p>Each item in the set is accessed as a string. Set elements 45 * consisting of single code points are returned as strings containing 46 * just the one code point. 47 * 48 * <p>To iterate over code point ranges, instead of individual code points, 49 * use a loop like this: 50 * <pre> 51 * UnicodeSetIterator it(set); 52 * while (it.nextRange()) { 53 * if (it.isString()) { 54 * processString(it.getString()); 55 * } else { 56 * processCodepointRange(it.getCodepoint(), it.getCodepointEnd()); 57 * } 58 * } 59 * </pre> 60 * @author M. Davis 61 * @stable ICU 2.4 62 */ 63 class U_COMMON_API UnicodeSetIterator : public UObject { 64 65 protected: 66 67 /** 68 * Value of <tt>codepoint</tt> if the iterator points to a string. 69 * If <tt>codepoint == IS_STRING</tt>, then examine 70 * <tt>string</tt> for the current iteration result. 71 * @stable ICU 2.4 72 */ 73 enum { IS_STRING = -1 }; 74 75 /** 76 * Current code point, or the special value <tt>IS_STRING</tt>, if 77 * the iterator points to a string. 78 * @stable ICU 2.4 79 */ 80 UChar32 codepoint; 81 82 /** 83 * When iterating over ranges using <tt>nextRange()</tt>, 84 * <tt>codepointEnd</tt> contains the inclusive end of the 85 * iteration range, if <tt>codepoint != IS_STRING</tt>. If 86 * iterating over code points using <tt>next()</tt>, or if 87 * <tt>codepoint == IS_STRING</tt>, then the value of 88 * <tt>codepointEnd</tt> is undefined. 89 * @stable ICU 2.4 90 */ 91 UChar32 codepointEnd; 92 93 /** 94 * If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points 95 * to the current string. If <tt>codepoint != IS_STRING</tt>, the 96 * value of <tt>string</tt> is undefined. 97 * @stable ICU 2.4 98 */ 99 const UnicodeString* string; 100 101 public: 102 103 /** 104 * Create an iterator over the given set. The iterator is valid 105 * only so long as <tt>set</tt> is valid. 106 * @param set set to iterate over 107 * @stable ICU 2.4 108 */ 109 UnicodeSetIterator(const UnicodeSet& set); 110 111 /** 112 * Create an iterator over nothing. <tt>next()</tt> and 113 * <tt>nextRange()</tt> return false. This is a convenience 114 * constructor allowing the target to be set later. 115 * @stable ICU 2.4 116 */ 117 UnicodeSetIterator(); 118 119 /** 120 * Destructor. 121 * @stable ICU 2.4 122 */ 123 virtual ~UnicodeSetIterator(); 124 125 /** 126 * Returns true if the current element is a string. If so, the 127 * caller can retrieve it with <tt>getString()</tt>. If this 128 * method returns false, the current element is a code point or 129 * code point range, depending on whether <tt>next()</tt> or 130 * <tt>nextRange()</tt> was called. 131 * Elements of types string and codepoint can both be retrieved 132 * with the function <tt>getString()</tt>. 133 * Elements of type codepoint can also be retrieved with 134 * <tt>getCodepoint()</tt>. 135 * For ranges, <tt>getCodepoint()</tt> returns the starting codepoint 136 * of the range, and <tt>getCodepointEnd()</tt> returns the end 137 * of the range. 138 * @stable ICU 2.4 139 */ 140 inline UBool isString() const; 141 142 /** 143 * Returns the current code point, if <tt>isString()</tt> returned 144 * false. Otherwise returns an undefined result. 145 * @stable ICU 2.4 146 */ 147 inline UChar32 getCodepoint() const; 148 149 /** 150 * Returns the end of the current code point range, if 151 * <tt>isString()</tt> returned false and <tt>nextRange()</tt> was 152 * called. Otherwise returns an undefined result. 153 * @stable ICU 2.4 154 */ 155 inline UChar32 getCodepointEnd() const; 156 157 /** 158 * Returns the current string, if <tt>isString()</tt> returned 159 * true. If the current iteration item is a code point, a UnicodeString 160 * containing that single code point is returned. 161 * 162 * Ownership of the returned string remains with the iterator. 163 * The string is guaranteed to remain valid only until the iterator is 164 * advanced to the next item, or until the iterator is deleted. 165 * 166 * @stable ICU 2.4 167 */ 168 const UnicodeString& getString(); 169 170 /** 171 * Advances the iteration position to the next element in the set, 172 * which can be either a single code point or a string. 173 * If there are no more elements in the set, return false. 174 * 175 * <p> 176 * If <tt>isString() == TRUE</tt>, the value is a 177 * string, otherwise the value is a 178 * single code point. Elements of either type can be retrieved 179 * with the function <tt>getString()</tt>, while elements of 180 * consisting of a single code point can be retrieved with 181 * <tt>getCodepoint()</tt> 182 * 183 * <p>The order of iteration is all code points in sorted order, 184 * followed by all strings sorted order. Do not mix 185 * calls to <tt>next()</tt> and <tt>nextRange()</tt> without 186 * calling <tt>reset()</tt> between them. The results of doing so 187 * are undefined. 188 * 189 * @return true if there was another element in the set. 190 * @stable ICU 2.4 191 */ 192 UBool next(); 193 194 /** 195 * Returns the next element in the set, either a code point range 196 * or a string. If there are no more elements in the set, return 197 * false. If <tt>isString() == TRUE</tt>, the value is a 198 * string and can be accessed with <tt>getString()</tt>. Otherwise the value is a 199 * range of one or more code points from <tt>getCodepoint()</tt> to 200 * <tt>getCodepointeEnd()</tt> inclusive. 201 * 202 * <p>The order of iteration is all code points ranges in sorted 203 * order, followed by all strings sorted order. Ranges are 204 * disjoint and non-contiguous. The value returned from <tt>getString()</tt> 205 * is undefined unless <tt>isString() == TRUE</tt>. Do not mix calls to 206 * <tt>next()</tt> and <tt>nextRange()</tt> without calling 207 * <tt>reset()</tt> between them. The results of doing so are 208 * undefined. 209 * 210 * @return true if there was another element in the set. 211 * @stable ICU 2.4 212 */ 213 UBool nextRange(); 214 215 /** 216 * Sets this iterator to visit the elements of the given set and 217 * resets it to the start of that set. The iterator is valid only 218 * so long as <tt>set</tt> is valid. 219 * @param set the set to iterate over. 220 * @stable ICU 2.4 221 */ 222 void reset(const UnicodeSet& set); 223 224 /** 225 * Resets this iterator to the start of the set. 226 * @stable ICU 2.4 227 */ 228 void reset(); 229 230 /** 231 * ICU "poor man's RTTI", returns a UClassID for this class. 232 * 233 * @stable ICU 2.4 234 */ 235 static UClassID U_EXPORT2 getStaticClassID(); 236 237 /** 238 * ICU "poor man's RTTI", returns a UClassID for the actual class. 239 * 240 * @stable ICU 2.4 241 */ 242 virtual UClassID getDynamicClassID() const; 243 244 // ======================= PRIVATES =========================== 245 246 protected: 247 248 // endElement and nextElements are really UChar32's, but we keep 249 // them as signed int32_t's so we can do comparisons with 250 // endElement set to -1. Leave them as int32_t's. 251 /** The set 252 * @stable ICU 2.4 253 */ 254 const UnicodeSet* set; 255 /** End range 256 * @stable ICU 2.4 257 */ 258 int32_t endRange; 259 /** Range 260 * @stable ICU 2.4 261 */ 262 int32_t range; 263 /** End element 264 * @stable ICU 2.4 265 */ 266 int32_t endElement; 267 /** Next element 268 * @stable ICU 2.4 269 */ 270 int32_t nextElement; 271 //UBool abbreviated; 272 /** Next string 273 * @stable ICU 2.4 274 */ 275 int32_t nextString; 276 /** String count 277 * @stable ICU 2.4 278 */ 279 int32_t stringCount; 280 281 /** 282 * Points to the string to use when the caller asks for a 283 * string and the current iteration item is a code point, not a string. 284 * @internal 285 */ 286 UnicodeString *cpString; 287 288 /** Copy constructor. Disallowed. 289 * @stable ICU 2.4 290 */ 291 UnicodeSetIterator(const UnicodeSetIterator&); // disallow 292 293 /** Assignment operator. Disallowed. 294 * @stable ICU 2.4 295 */ 296 UnicodeSetIterator& operator=(const UnicodeSetIterator&); // disallow 297 298 /** Load range 299 * @stable ICU 2.4 300 */ 301 virtual void loadRange(int32_t range); 302 303 }; 304 305 inline UBool UnicodeSetIterator::isString() const { 306 return codepoint == (UChar32)IS_STRING; 307 } 308 309 inline UChar32 UnicodeSetIterator::getCodepoint() const { 310 return codepoint; 311 } 312 313 inline UChar32 UnicodeSetIterator::getCodepointEnd() const { 314 return codepointEnd; 315 } 316 317 318 U_NAMESPACE_END 319 320 #endif 321