1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.CharacterIterator; 12 13 import com.ibm.icu.impl.CharacterIteratorWrapper; 14 import com.ibm.icu.impl.ReplaceableUCharacterIterator; 15 import com.ibm.icu.impl.UCharArrayIterator; 16 import com.ibm.icu.impl.UCharacterIteratorWrapper; 17 18 /** 19 * Abstract class that defines an API for iteration on text objects.This is an interface for forward and backward 20 * iteration and random access into a text object. Forward iteration is done with post-increment and backward iteration 21 * is done with pre-decrement semantics, while the <code>java.text.CharacterIterator</code> interface methods provided 22 * forward iteration with "pre-increment" and backward iteration with pre-decrement semantics. This API is more 23 * efficient for forward iteration over code points. The other major difference is that this API can do both code unit 24 * and code point iteration, <code>java.text.CharacterIterator</code> can only iterate over code units and is limited to 25 * BMP (0 - 0xFFFF) 26 * 27 * @author Ram 28 * @stable ICU 2.4 29 */ 30 public abstract class UCharacterIterator implements Cloneable, UForwardCharacterIterator { 31 32 /** 33 * Protected default constructor for the subclasses 34 * 35 * @stable ICU 2.4 36 */ 37 protected UCharacterIterator() { 38 } 39 40 // static final methods ---------------------------------------------------- 41 42 /** 43 * Returns a <code>UCharacterIterator</code> object given a <code>Replaceable</code> object. 44 * 45 * @param source 46 * a valid source as a <code>Replaceable</code> object 47 * @return UCharacterIterator object 48 * @exception IllegalArgumentException 49 * if the argument is null 50 * @stable ICU 2.4 51 */ 52 public static final UCharacterIterator getInstance(Replaceable source) { 53 return new ReplaceableUCharacterIterator(source); 54 } 55 56 /** 57 * Returns a <code>UCharacterIterator</code> object given a source string. 58 * 59 * @param source 60 * a string 61 * @return UCharacterIterator object 62 * @exception IllegalArgumentException 63 * if the argument is null 64 * @stable ICU 2.4 65 */ 66 public static final UCharacterIterator getInstance(String source) { 67 return new ReplaceableUCharacterIterator(source); 68 } 69 70 /** 71 * Returns a <code>UCharacterIterator</code> object given a source character array. 72 * 73 * @param source 74 * an array of UTF-16 code units 75 * @return UCharacterIterator object 76 * @exception IllegalArgumentException 77 * if the argument is null 78 * @stable ICU 2.4 79 */ 80 public static final UCharacterIterator getInstance(char[] source) { 81 return getInstance(source, 0, source.length); 82 } 83 84 /** 85 * Returns a <code>UCharacterIterator</code> object given a source character array. 86 * 87 * @param source 88 * an array of UTF-16 code units 89 * @return UCharacterIterator object 90 * @exception IllegalArgumentException 91 * if the argument is null 92 * @stable ICU 2.4 93 */ 94 public static final UCharacterIterator getInstance(char[] source, int start, int limit) { 95 return new UCharArrayIterator(source, start, limit); 96 } 97 98 /** 99 * Returns a <code>UCharacterIterator</code> object given a source StringBuffer. 100 * 101 * @param source 102 * an string buffer of UTF-16 code units 103 * @return UCharacterIterator object 104 * @exception IllegalArgumentException 105 * if the argument is null 106 * @stable ICU 2.4 107 */ 108 public static final UCharacterIterator getInstance(StringBuffer source) { 109 return new ReplaceableUCharacterIterator(source); 110 } 111 112 /** 113 * Returns a <code>UCharacterIterator</code> object given a CharacterIterator. 114 * 115 * @param source 116 * a valid CharacterIterator object. 117 * @return UCharacterIterator object 118 * @exception IllegalArgumentException 119 * if the argument is null 120 * @stable ICU 2.4 121 */ 122 public static final UCharacterIterator getInstance(CharacterIterator source) { 123 return new CharacterIteratorWrapper(source); 124 } 125 126 // public methods ---------------------------------------------------------- 127 /** 128 * Returns a <code>java.text.CharacterIterator</code> object for the underlying text of this iterator. The returned 129 * iterator is independent of this iterator. 130 * 131 * @return java.text.CharacterIterator object 132 * @stable ICU 2.4 133 */ 134 public CharacterIterator getCharacterIterator() { 135 return new UCharacterIteratorWrapper(this); 136 } 137 138 /** 139 * Returns the code unit at the current index. If index is out of range, returns DONE. Index is not changed. 140 * 141 * @return current code unit 142 * @stable ICU 2.4 143 */ 144 public abstract int current(); 145 146 /** 147 * Returns the codepoint at the current index. If the current index is invalid, DONE is returned. If the current 148 * index points to a lead surrogate, and there is a following trail surrogate, then the code point is returned. 149 * Otherwise, the code unit at index is returned. Index is not changed. 150 * 151 * @return current codepoint 152 * @stable ICU 2.4 153 */ 154 public int currentCodePoint() { 155 int ch = current(); 156 if (UTF16.isLeadSurrogate((char) ch)) { 157 // advance the index to get the 158 // next code point 159 next(); 160 // due to post increment semantics 161 // current() after next() actually 162 // returns the char we want 163 int ch2 = current(); 164 // current should never change 165 // the current index so back off 166 previous(); 167 168 if (UTF16.isTrailSurrogate((char) ch2)) { 169 // we found a surrogate pair 170 // return the codepoint 171 return Character.toCodePoint((char) ch, (char) ch2); 172 } 173 } 174 return ch; 175 } 176 177 /** 178 * Returns the length of the text 179 * 180 * @return length of the text 181 * @stable ICU 2.4 182 */ 183 public abstract int getLength(); 184 185 /** 186 * Gets the current index in text. 187 * 188 * @return current index in text. 189 * @stable ICU 2.4 190 */ 191 public abstract int getIndex(); 192 193 /** 194 * Returns the UTF16 code unit at index, and increments to the next code unit (post-increment semantics). If index 195 * is out of range, DONE is returned, and the iterator is reset to the limit of the text. 196 * 197 * @return the next UTF16 code unit, or DONE if the index is at the limit of the text. 198 * @stable ICU 2.4 199 */ 200 @Override 201 public abstract int next(); 202 203 /** 204 * Returns the code point at index, and increments to the next code point (post-increment semantics). If index does 205 * not point to a valid surrogate pair, the behavior is the same as <code>next()</code>. Otherwise the iterator is 206 * incremented past the surrogate pair, and the code point represented by the pair is returned. 207 * 208 * @return the next codepoint in text, or DONE if the index is at the limit of the text. 209 * @stable ICU 2.4 210 */ 211 @Override 212 public int nextCodePoint() { 213 int ch1 = next(); 214 if (UTF16.isLeadSurrogate((char) ch1)) { 215 int ch2 = next(); 216 if (UTF16.isTrailSurrogate((char) ch2)) { 217 return Character.toCodePoint((char) ch1, (char) ch2); 218 } else if (ch2 != DONE) { 219 // unmatched surrogate so back out 220 previous(); 221 } 222 } 223 return ch1; 224 } 225 226 /** 227 * Decrement to the position of the previous code unit in the text, and return it (pre-decrement semantics). If the 228 * resulting index is less than 0, the index is reset to 0 and DONE is returned. 229 * 230 * @return the previous code unit in the text, or DONE if the new index is before the start of the text. 231 * @stable ICU 2.4 232 */ 233 public abstract int previous(); 234 235 /** 236 * Retreat to the start of the previous code point in the text, and return it (pre-decrement semantics). If the 237 * index is not preceeded by a valid surrogate pair, the behavior is the same as <code>previous()</code>. Otherwise 238 * the iterator is decremented to the start of the surrogate pair, and the code point represented by the pair is 239 * returned. 240 * 241 * @return the previous code point in the text, or DONE if the new index is before the start of the text. 242 * @stable ICU 2.4 243 */ 244 public int previousCodePoint() { 245 int ch1 = previous(); 246 if (UTF16.isTrailSurrogate((char) ch1)) { 247 int ch2 = previous(); 248 if (UTF16.isLeadSurrogate((char) ch2)) { 249 return Character.toCodePoint((char) ch2, (char) ch1); 250 } else if (ch2 != DONE) { 251 // unmatched trail surrogate so back out 252 next(); 253 } 254 } 255 return ch1; 256 } 257 258 /** 259 * Sets the index to the specified index in the text. 260 * 261 * @param index 262 * the index within the text. 263 * @exception IndexOutOfBoundsException 264 * is thrown if an invalid index is supplied 265 * @stable ICU 2.4 266 */ 267 public abstract void setIndex(int index); 268 269 /** 270 * Sets the current index to the limit. 271 * 272 * @stable ICU 2.4 273 */ 274 public void setToLimit() { 275 setIndex(getLength()); 276 } 277 278 /** 279 * Sets the current index to the start. 280 * 281 * @stable ICU 2.4 282 */ 283 public void setToStart() { 284 setIndex(0); 285 } 286 287 /** 288 * Fills the buffer with the underlying text storage of the iterator If the buffer capacity is not enough a 289 * exception is thrown. The capacity of the fill in buffer should at least be equal to length of text in the 290 * iterator obtained by calling <code>getLength()</code>). <b>Usage:</b> 291 * 292 * <pre> 293 * UChacterIterator iter = new UCharacterIterator.getInstance(text); 294 * char[] buf = new char[iter.getLength()]; 295 * iter.getText(buf); 296 * 297 * OR 298 * char[] buf= new char[1]; 299 * int len = 0; 300 * for(;;){ 301 * try{ 302 * len = iter.getText(buf); 303 * break; 304 * }catch(IndexOutOfBoundsException e){ 305 * buf = new char[iter.getLength()]; 306 * } 307 * } 308 * </pre> 309 * 310 * @param fillIn 311 * an array of chars to fill with the underlying UTF-16 code units. 312 * @param offset 313 * the position within the array to start putting the data. 314 * @return the number of code units added to fillIn, as a convenience 315 * @exception IndexOutOfBoundsException 316 * exception if there is not enough room after offset in the array, or if offset < 0. 317 * @stable ICU 2.4 318 */ 319 public abstract int getText(char[] fillIn, int offset); 320 321 /** 322 * Convenience override for <code>getText(char[], int)</code> that provides an offset of 0. 323 * 324 * @param fillIn 325 * an array of chars to fill with the underlying UTF-16 code units. 326 * @return the number of code units added to fillIn, as a convenience 327 * @exception IndexOutOfBoundsException 328 * exception if there is not enough room in the array. 329 * @stable ICU 2.4 330 */ 331 public final int getText(char[] fillIn) { 332 return getText(fillIn, 0); 333 } 334 335 /** 336 * Convenience method for returning the underlying text storage as as string 337 * 338 * @return the underlying text storage in the iterator as a string 339 * @stable ICU 2.4 340 */ 341 public String getText() { 342 char[] text = new char[getLength()]; 343 getText(text); 344 return new String(text); 345 } 346 347 /** 348 * Moves the current position by the number of code units specified, either forward or backward depending on the 349 * sign of delta (positive or negative respectively). If the resulting index would be less than zero, the index is 350 * set to zero, and if the resulting index would be greater than limit, the index is set to limit. 351 * 352 * @param delta 353 * the number of code units to move the current index. 354 * @return the new index. 355 * @exception IndexOutOfBoundsException 356 * is thrown if an invalid index is supplied 357 * @stable ICU 2.4 358 * 359 */ 360 public int moveIndex(int delta) { 361 int x = Math.max(0, Math.min(getIndex() + delta, getLength())); 362 setIndex(x); 363 return x; 364 } 365 366 /** 367 * Moves the current position by the number of code points specified, either forward or backward depending on the 368 * sign of delta (positive or negative respectively). If the current index is at a trail surrogate then the first 369 * adjustment is by code unit, and the remaining adjustments are by code points. If the resulting index would be 370 * less than zero, the index is set to zero, and if the resulting index would be greater than limit, the index is 371 * set to limit. 372 * 373 * @param delta 374 * the number of code units to move the current index. 375 * @return the new index 376 * @exception IndexOutOfBoundsException 377 * is thrown if an invalid delta is supplied 378 * @stable ICU 2.4 379 */ 380 public int moveCodePointIndex(int delta) { 381 if (delta > 0) { 382 while (delta > 0 && nextCodePoint() != DONE) { 383 delta--; 384 } 385 } else { 386 while (delta < 0 && previousCodePoint() != DONE) { 387 delta++; 388 } 389 } 390 if (delta != 0) { 391 throw new IndexOutOfBoundsException(); 392 } 393 394 return getIndex(); 395 } 396 397 /** 398 * Creates a copy of this iterator, independent from other iterators. If it is not possible to clone the iterator, 399 * returns null. 400 * 401 * @return copy of this iterator 402 * @stable ICU 2.4 403 */ 404 @Override 405 public Object clone() throws CloneNotSupportedException { 406 return super.clone(); 407 } 408 409 } 410