1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ********************************************************************** 5 * Copyright (c) 2003-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Author: Alan Liu 9 * Created: September 23 2003 10 * Since: ICU 2.8 11 ********************************************************************** 12 */ 13 package com.ibm.icu.impl; 14 15 import java.text.ParsePosition; 16 17 import com.ibm.icu.text.SymbolTable; 18 import com.ibm.icu.text.UTF16; 19 20 /** 21 * An iterator that returns 32-bit code points. This class is deliberately 22 * <em>not</em> related to any of the JDK or ICU4J character iterator classes 23 * in order to minimize complexity. 24 * @author Alan Liu 25 * @since ICU 2.8 26 */ 27 public class RuleCharacterIterator { 28 29 // TODO: Ideas for later. (Do not implement if not needed, lest the 30 // code coverage numbers go down due to unused methods.) 31 // 1. Add a copy constructor, equals() method, clone() method. 32 // 2. Rather than return DONE, throw an exception if the end 33 // is reached -- this is an alternate usage model, probably not useful. 34 // 3. Return isEscaped from next(). If this happens, 35 // don't keep an isEscaped member variable. 36 37 /** 38 * Text being iterated. 39 */ 40 private String text; 41 42 /** 43 * Position of iterator. 44 */ 45 private ParsePosition pos; 46 47 /** 48 * Symbol table used to parse and dereference variables. May be null. 49 */ 50 private SymbolTable sym; 51 52 /** 53 * Current variable expansion, or null if none. 54 */ 55 private char[] buf; 56 57 /** 58 * Position within buf[]. Meaningless if buf == null. 59 */ 60 private int bufPos; 61 62 /** 63 * Flag indicating whether the last character was parsed from an escape. 64 */ 65 private boolean isEscaped; 66 67 /** 68 * Value returned when there are no more characters to iterate. 69 */ 70 public static final int DONE = -1; 71 72 /** 73 * Bitmask option to enable parsing of variable names. If (options & 74 * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to 75 * its value. Variables are parsed using the SymbolTable API. 76 */ 77 public static final int PARSE_VARIABLES = 1; 78 79 /** 80 * Bitmask option to enable parsing of escape sequences. If (options & 81 * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded 82 * to its value. Escapes are parsed using Utility.unescapeAt(). 83 */ 84 public static final int PARSE_ESCAPES = 2; 85 86 /** 87 * Bitmask option to enable skipping of whitespace. If (options & 88 * SKIP_WHITESPACE) != 0, then Unicode Pattern_White_Space characters will be silently 89 * skipped, as if they were not present in the input. 90 */ 91 public static final int SKIP_WHITESPACE = 4; 92 93 /** 94 * Constructs an iterator over the given text, starting at the given 95 * position. 96 * @param text the text to be iterated 97 * @param sym the symbol table, or null if there is none. If sym is null, 98 * then variables will not be deferenced, even if the PARSE_VARIABLES 99 * option is set. 100 * @param pos upon input, the index of the next character to return. If a 101 * variable has been dereferenced, then pos will <em>not</em> increment as 102 * characters of the variable value are iterated. 103 */ 104 public RuleCharacterIterator(String text, SymbolTable sym, 105 ParsePosition pos) { 106 if (text == null || pos.getIndex() > text.length()) { 107 throw new IllegalArgumentException(); 108 } 109 this.text = text; 110 this.sym = sym; 111 this.pos = pos; 112 buf = null; 113 } 114 115 /** 116 * Returns true if this iterator has no more characters to return. 117 */ 118 public boolean atEnd() { 119 return buf == null && pos.getIndex() == text.length(); 120 } 121 122 /** 123 * Returns the next character using the given options, or DONE if there 124 * are no more characters, and advance the position to the next 125 * character. 126 * @param options one or more of the following options, bitwise-OR-ed 127 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. 128 * @return the current 32-bit code point, or DONE 129 */ 130 public int next(int options) { 131 int c = DONE; 132 isEscaped = false; 133 134 for (;;) { 135 c = _current(); 136 _advance(UTF16.getCharCount(c)); 137 138 if (c == SymbolTable.SYMBOL_REF && buf == null && 139 (options & PARSE_VARIABLES) != 0 && sym != null) { 140 String name = sym.parseReference(text, pos, text.length()); 141 // If name == null there was an isolated SYMBOL_REF; 142 // return it. Caller must be prepared for this. 143 if (name == null) { 144 break; 145 } 146 bufPos = 0; 147 buf = sym.lookup(name); 148 if (buf == null) { 149 throw new IllegalArgumentException( 150 "Undefined variable: " + name); 151 } 152 // Handle empty variable value 153 if (buf.length == 0) { 154 buf = null; 155 } 156 continue; 157 } 158 159 if ((options & SKIP_WHITESPACE) != 0 && 160 PatternProps.isWhiteSpace(c)) { 161 continue; 162 } 163 164 if (c == '\\' && (options & PARSE_ESCAPES) != 0) { 165 int offset[] = new int[] { 0 }; 166 c = Utility.unescapeAt(lookahead(), offset); 167 jumpahead(offset[0]); 168 isEscaped = true; 169 if (c < 0) { 170 throw new IllegalArgumentException("Invalid escape"); 171 } 172 } 173 174 break; 175 } 176 177 return c; 178 } 179 180 /** 181 * Returns true if the last character returned by next() was 182 * escaped. This will only be the case if the option passed in to 183 * next() included PARSE_ESCAPED and the next character was an 184 * escape sequence. 185 */ 186 public boolean isEscaped() { 187 return isEscaped; 188 } 189 190 /** 191 * Returns true if this iterator is currently within a variable expansion. 192 */ 193 public boolean inVariable() { 194 return buf != null; 195 } 196 197 /** 198 * Returns an object which, when later passed to setPos(), will 199 * restore this iterator's position. Usage idiom: 200 * 201 * RuleCharacterIterator iterator = ...; 202 * Object pos = iterator.getPos(null); // allocate position object 203 * for (;;) { 204 * pos = iterator.getPos(pos); // reuse position object 205 * int c = iterator.next(...); 206 * ... 207 * } 208 * iterator.setPos(pos); 209 * 210 * @param p a position object previously returned by getPos(), 211 * or null. If not null, it will be updated and returned. If 212 * null, a new position object will be allocated and returned. 213 * @return a position object which may be passed to setPos(), 214 * either `p,' or if `p' == null, a newly-allocated object 215 */ 216 public Object getPos(Object p) { 217 if (p == null) { 218 return new Object[] {buf, new int[] {pos.getIndex(), bufPos}}; 219 } 220 Object[] a = (Object[]) p; 221 a[0] = buf; 222 int[] v = (int[]) a[1]; 223 v[0] = pos.getIndex(); 224 v[1] = bufPos; 225 return p; 226 } 227 228 /** 229 * Restores this iterator to the position it had when getPos() 230 * returned the given object. 231 * @param p a position object previously returned by getPos() 232 */ 233 public void setPos(Object p) { 234 Object[] a = (Object[]) p; 235 buf = (char[]) a[0]; 236 int[] v = (int[]) a[1]; 237 pos.setIndex(v[0]); 238 bufPos = v[1]; 239 } 240 241 /** 242 * Skips ahead past any ignored characters, as indicated by the given 243 * options. This is useful in conjunction with the lookahead() method. 244 * 245 * Currently, this only has an effect for SKIP_WHITESPACE. 246 * @param options one or more of the following options, bitwise-OR-ed 247 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. 248 */ 249 public void skipIgnored(int options) { 250 if ((options & SKIP_WHITESPACE) != 0) { 251 for (;;) { 252 int a = _current(); 253 if (!PatternProps.isWhiteSpace(a)) break; 254 _advance(UTF16.getCharCount(a)); 255 } 256 } 257 } 258 259 /** 260 * Returns a string containing the remainder of the characters to be 261 * returned by this iterator, without any option processing. If the 262 * iterator is currently within a variable expansion, this will only 263 * extend to the end of the variable expansion. This method is provided 264 * so that iterators may interoperate with string-based APIs. The typical 265 * sequence of calls is to call skipIgnored(), then call lookahead(), then 266 * parse the string returned by lookahead(), then call jumpahead() to 267 * resynchronize the iterator. 268 * @return a string containing the characters to be returned by future 269 * calls to next() 270 */ 271 public String lookahead() { 272 if (buf != null) { 273 return new String(buf, bufPos, buf.length - bufPos); 274 } else { 275 return text.substring(pos.getIndex()); 276 } 277 } 278 279 /** 280 * Advances the position by the given number of 16-bit code units. 281 * This is useful in conjunction with the lookahead() method. 282 * @param count the number of 16-bit code units to jump over 283 */ 284 public void jumpahead(int count) { 285 if (count < 0) { 286 throw new IllegalArgumentException(); 287 } 288 if (buf != null) { 289 bufPos += count; 290 if (bufPos > buf.length) { 291 throw new IllegalArgumentException(); 292 } 293 if (bufPos == buf.length) { 294 buf = null; 295 } 296 } else { 297 int i = pos.getIndex() + count; 298 pos.setIndex(i); 299 if (i > text.length()) { 300 throw new IllegalArgumentException(); 301 } 302 } 303 } 304 305 /** 306 * Returns a string representation of this object, consisting of the 307 * characters being iterated, with a '|' marking the current position. 308 * Position within an expanded variable is <em>not</em> indicated. 309 * @return a string representation of this object 310 */ 311 @Override 312 public String toString() { 313 int b = pos.getIndex(); 314 return text.substring(0, b) + '|' + text.substring(b); 315 } 316 317 /** 318 * Returns the current 32-bit code point without parsing escapes, parsing 319 * variables, or skipping whitespace. 320 * @return the current 32-bit code point 321 */ 322 private int _current() { 323 if (buf != null) { 324 return UTF16.charAt(buf, 0, buf.length, bufPos); 325 } else { 326 int i = pos.getIndex(); 327 return (i < text.length()) ? UTF16.charAt(text, i) : DONE; 328 } 329 } 330 331 /** 332 * Advances the position by the given amount. 333 * @param count the number of 16-bit code units to advance past 334 */ 335 private void _advance(int count) { 336 if (buf != null) { 337 bufPos += count; 338 if (bufPos == buf.length) { 339 buf = null; 340 } 341 } else { 342 pos.setIndex(pos.getIndex() + count); 343 if (pos.getIndex() > text.length()) { 344 pos.setIndex(text.length()); 345 } 346 } 347 } 348 }