// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.dev.util;

import java.util.Iterator;
import java.util.Set;

import com.ibm.icu.text.UTF16;

/**
 * UnicodeMapIterator iterates over the contents of a UnicodeMap. It
 * iterates over either code points or code point ranges, skipping ranges
 * whose value is {@code null}. After all code points or ranges have been
 * returned, it returns the multicharacter strings of the UnicodeMap, if any.
 *
 * <p>To iterate over code points, use a loop like this:
 * <pre>
 * UnicodeMapIterator&lt;V&gt; it = new UnicodeMapIterator&lt;V&gt;(map);
 * while (it.next()) {
 *   if (it.codepoint != UnicodeMapIterator.IS_STRING) {
 *     processCodepoint(it.codepoint, it.value);
 *   } else {
 *     processString(it.string);
 *   }
 * }
 * </pre>
 *
 * <p>To iterate over code point ranges, use a loop like this:
 * <pre>
 * UnicodeMapIterator&lt;V&gt; it = new UnicodeMapIterator&lt;V&gt;(map);
 * while (it.nextRange()) {
 *   if (it.codepoint != UnicodeMapIterator.IS_STRING) {
 *     processCodepointRange(it.codepoint, it.codepointEnd, it.value);
 *   } else {
 *     processString(it.string);
 *   }
 * }
 * </pre>
 * @author M. Davis
 *
 * @internal CLDR
 */
public class UnicodeMapIterator<T> {

    /**
     * Value of {@code codepoint} if the iterator points to a string.
     * If {@code codepoint == IS_STRING}, then examine
     * {@code string} for the current iteration result.
     */
    public static final int IS_STRING = -1;

    /**
     * Current code point, or the special value {@code IS_STRING}, if
     * the iterator points to a string.
     */
    public int codepoint;

    /**
     * When iterating over ranges using {@code nextRange()},
     * {@code codepointEnd} contains the inclusive end of the
     * iteration range, if {@code codepoint != IS_STRING}. If
     * iterating over code points using {@code next()}, or if
     * {@code codepoint == IS_STRING}, then the value of
     * {@code codepointEnd} is undefined.
     */
    public int codepointEnd;

    /**
     * If {@code codepoint == IS_STRING}, then {@code string} points
     * to the current string. If {@code codepoint != IS_STRING}, the
     * value of {@code string} is undefined.
     */
    public String string;

    /**
     * The value associated with the current code point or range, as stored
     * by the most recent range load. It is cleared to {@code null} by
     * {@code reset()}.
     *
     * <p>NOTE(review): this field is not updated when the iterator returns a
     * string ({@code codepoint == IS_STRING}); it then still holds whatever
     * the last range load stored. Look the string up in the map directly if
     * its value is needed.
     */
    public T value;

    /**
     * Create an iterator over the given map.
     * @param set map to iterate over; must not be null
     */
    // The raw parameter type is kept so existing callers continue to compile.
    @SuppressWarnings("rawtypes")
    public UnicodeMapIterator(UnicodeMap set) {
        reset(set);
    }

    /**
     * Create an iterator over a newly created, empty {@code UnicodeMap}.
     * This is a convenience constructor allowing the target to be set
     * later with {@link #reset(UnicodeMap)}.
     */
    public UnicodeMapIterator() {
        reset(new UnicodeMap<T>());
    }

    /**
     * Advances to the next element in the map, either a single code point
     * or a string. If {@code codepoint == IS_STRING}, the element is a
     * string in the {@code string} field. Otherwise the element is a
     * single code point in the {@code codepoint} field and its value is in
     * the {@code value} field.
     *
     * <p>Code points are visited range by range, in the map's range index
     * order; ranges whose value is {@code null} are skipped. Strings follow,
     * in the order produced by the map's string set iterator.
     * {@code string} is undefined unless {@code codepoint == IS_STRING}, and
     * {@code value} is not updated for strings. Do not mix calls to
     * {@code next()} and {@code nextRange()} without calling
     * {@code reset()} between them. The results of doing so are undefined.
     *
     * @return true if there was another element in the map and this
     * object contains the element.
     */
    public boolean next() {
        // Current range exhausted: move on to the next non-null range, or to strings.
        if (nextElement > endElement && !advanceToNextRange()) {
            return nextString();
        }
        codepoint = codepointEnd = nextElement++;
        return true;
    }

    /**
     * Advances to the next element in the map, either a code point range
     * or a string. If {@code codepoint == IS_STRING}, the element is a
     * string in the {@code string} field. Otherwise the element is a
     * range of one or more code points from {@code codepoint} to
     * {@code codepointEnd} inclusive, with its value in the {@code value}
     * field.
     *
     * <p>Ranges are visited in the map's range index order; ranges whose
     * value is {@code null} are skipped. Strings follow, in the order
     * produced by the map's string set iterator. {@code string} is undefined
     * unless {@code codepoint == IS_STRING}, and {@code value} is not
     * updated for strings. Do not mix calls to {@code next()} and
     * {@code nextRange()} without calling {@code reset()} between them.
     * The results of doing so are undefined.
     *
     * @return true if there was another element in the map and this
     * object contains the element.
     */
    public boolean nextRange() {
        // Current range exhausted: move on to the next non-null range, or to strings.
        if (nextElement > endElement && !advanceToNextRange()) {
            return nextString();
        }
        codepoint = nextElement;
        codepointEnd = endElement;
        // Mark the whole range as consumed so the next call loads a new one.
        nextElement = endElement + 1;
        return true;
    }

    /**
     * Sets this iterator to visit the elements of the given map and
     * resets it to the start of that map. The iterator is valid only
     * so long as {@code set} is valid.
     * @param set the map to iterate over; must not be null
     * @throws NullPointerException if {@code set} is null
     */
    // The raw parameter type is kept so existing callers continue to compile;
    // the assignment to the typed field is therefore unchecked.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public void reset(UnicodeMap set) {
        if (set == null) {
            // Fail before touching state so the iterator is not left pointing at a null map.
            throw new NullPointerException("set");
        }
        this.map = set;
        reset();
    }

    /**
     * Resets this iterator to the start of the current map.
     * @return this iterator, to allow call chaining
     */
    public UnicodeMapIterator<T> reset() {
        endRange = map.getRangeCount() - 1;
        // Both next*() methods test: if (nextElement <= endElement).
        // Make that test fail so that the first call loads the first range.
        nextElement = 0;
        endElement = -1;
        range = -1;

        // Invariant: stringIterator is null when no strings remain.
        stringIterator = null;
        Set<String> strings = map.getNonRangeStrings();
        if (strings != null) {
            Iterator<String> candidate = strings.iterator();
            if (candidate.hasNext()) {
                stringIterator = candidate;
            }
        }
        value = null;
        return this;
    }

    /**
     * Gets the current element as a string: the {@code string} field if the
     * iterator points to a string, otherwise the string form of
     * {@code codepoint}. Only use after calling next(), not nextRange(),
     * because only the start of a range is converted.
     * @return the current element as a string
     */
    public String getString() {
        if (codepoint == IS_STRING) {
            return string;
        }
        return UTF16.valueOf(codepoint);
    }

    // ======================= PRIVATES ===========================

    private UnicodeMap<T> map;
    private int endRange = 0;
    private int range = 0;
    private Iterator<String> stringIterator = null;
    protected int endElement;
    protected int nextElement;

    /*
     * Invariant: stringIterator is null when there are no (more) strings remaining
     */

    /**
     * Loads the given range of the map into {@code nextElement},
     * {@code endElement} and {@code value}.
     * @param range index of the range to load
     * @return the value of the loaded range; may be null
     */
    protected T loadRange(int range) {
        nextElement = map.getRangeStart(range);
        endElement = map.getRangeEnd(range);
        value = map.getRangeValue(range);
        return value;
    }

    /**
     * Loads successive ranges until one with a non-null value is found.
     * @return true if such a range was loaded, false if the ranges are exhausted
     */
    private boolean advanceToNextRange() {
        while (range < endRange) {
            if (loadRange(++range) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Moves to the next string element, if any, setting {@code codepoint}
     * to {@code IS_STRING} and {@code string} to the element.
     * @return true if a string was produced, false if no strings remain
     */
    private boolean nextString() {
        if (stringIterator == null) {
            return false;
        }
        codepoint = IS_STRING; // signal that the element is actually a string
        string = stringIterator.next();
        if (!stringIterator.hasNext()) {
            stringIterator = null;
        }
        return true;
    }
}