Home | History | Annotate | Download | only in impl
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 **********************************************************************
      5 * Copyright (c) 2003-2011, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 * Author: Alan Liu
      9 * Created: September 23 2003
     10 * Since: ICU 2.8
     11 **********************************************************************
     12 */
     13 package com.ibm.icu.impl;
     14 
     15 import java.text.ParsePosition;
     16 
     17 import com.ibm.icu.text.SymbolTable;
     18 import com.ibm.icu.text.UTF16;
     19 
     20 /**
     21  * An iterator that returns 32-bit code points.  This class is deliberately
     22  * <em>not</em> related to any of the JDK or ICU4J character iterator classes
     23  * in order to minimize complexity.
     24  * @author Alan Liu
     25  * @since ICU 2.8
     26  */
     27 public class RuleCharacterIterator {
     28 
     29     // TODO: Ideas for later.  (Do not implement if not needed, lest the
     30     // code coverage numbers go down due to unused methods.)
     31     // 1. Add a copy constructor, equals() method, clone() method.
     32     // 2. Rather than return DONE, throw an exception if the end
     33     // is reached -- this is an alternate usage model, probably not useful.
     34     // 3. Return isEscaped from next().  If this happens,
     35     // don't keep an isEscaped member variable.
     36 
     37     /**
     38      * Text being iterated.
     39      */
     40     private String text;
     41 
     42     /**
     43      * Position of iterator.
     44      */
     45     private ParsePosition pos;
     46 
     47     /**
     48      * Symbol table used to parse and dereference variables.  May be null.
     49      */
     50     private SymbolTable sym;
     51 
     52     /**
     53      * Current variable expansion, or null if none.
     54      */
     55     private char[] buf;
     56 
     57     /**
     58      * Position within buf[].  Meaningless if buf == null.
     59      */
     60     private int bufPos;
     61 
     62     /**
     63      * Flag indicating whether the last character was parsed from an escape.
     64      */
     65     private boolean isEscaped;
     66 
     67     /**
     68      * Value returned when there are no more characters to iterate.
     69      */
     70     public static final int DONE = -1;
     71 
     72     /**
     73      * Bitmask option to enable parsing of variable names.  If (options &
     74      * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
     75      * its value.  Variables are parsed using the SymbolTable API.
     76      */
     77     public static final int PARSE_VARIABLES = 1;
     78 
     79     /**
     80      * Bitmask option to enable parsing of escape sequences.  If (options &
     81      * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
     82      * to its value.  Escapes are parsed using Utility.unescapeAt().
     83      */
     84     public static final int PARSE_ESCAPES   = 2;
     85 
     86     /**
     87      * Bitmask option to enable skipping of whitespace.  If (options &
     88      * SKIP_WHITESPACE) != 0, then Unicode Pattern_White_Space characters will be silently
     89      * skipped, as if they were not present in the input.
     90      */
     91     public static final int SKIP_WHITESPACE = 4;
     92 
     93     /**
     94      * Constructs an iterator over the given text, starting at the given
     95      * position.
     96      * @param text the text to be iterated
     97      * @param sym the symbol table, or null if there is none.  If sym is null,
     98      * then variables will not be deferenced, even if the PARSE_VARIABLES
     99      * option is set.
    100      * @param pos upon input, the index of the next character to return.  If a
    101      * variable has been dereferenced, then pos will <em>not</em> increment as
    102      * characters of the variable value are iterated.
    103      */
    104     public RuleCharacterIterator(String text, SymbolTable sym,
    105                                  ParsePosition pos) {
    106         if (text == null || pos.getIndex() > text.length()) {
    107             throw new IllegalArgumentException();
    108         }
    109         this.text = text;
    110         this.sym = sym;
    111         this.pos = pos;
    112         buf = null;
    113     }
    114 
    115     /**
    116      * Returns true if this iterator has no more characters to return.
    117      */
    118     public boolean atEnd() {
    119         return buf == null && pos.getIndex() == text.length();
    120     }
    121 
    122     /**
    123      * Returns the next character using the given options, or DONE if there
    124      * are no more characters, and advance the position to the next
    125      * character.
    126      * @param options one or more of the following options, bitwise-OR-ed
    127      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
    128      * @return the current 32-bit code point, or DONE
    129      */
    130     public int next(int options) {
    131         int c = DONE;
    132         isEscaped = false;
    133 
    134         for (;;) {
    135             c = _current();
    136             _advance(UTF16.getCharCount(c));
    137 
    138             if (c == SymbolTable.SYMBOL_REF && buf == null &&
    139                 (options & PARSE_VARIABLES) != 0 && sym != null) {
    140                 String name = sym.parseReference(text, pos, text.length());
    141                 // If name == null there was an isolated SYMBOL_REF;
    142                 // return it.  Caller must be prepared for this.
    143                 if (name == null) {
    144                     break;
    145                 }
    146                 bufPos = 0;
    147                 buf = sym.lookup(name);
    148                 if (buf == null) {
    149                     throw new IllegalArgumentException(
    150                                 "Undefined variable: " + name);
    151                 }
    152                 // Handle empty variable value
    153                 if (buf.length == 0) {
    154                     buf = null;
    155                 }
    156                 continue;
    157             }
    158 
    159             if ((options & SKIP_WHITESPACE) != 0 &&
    160                 PatternProps.isWhiteSpace(c)) {
    161                 continue;
    162             }
    163 
    164             if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
    165                 int offset[] = new int[] { 0 };
    166                 c = Utility.unescapeAt(lookahead(), offset);
    167                 jumpahead(offset[0]);
    168                 isEscaped = true;
    169                 if (c < 0) {
    170                     throw new IllegalArgumentException("Invalid escape");
    171                 }
    172             }
    173 
    174             break;
    175         }
    176 
    177         return c;
    178     }
    179 
    180     /**
    181      * Returns true if the last character returned by next() was
    182      * escaped.  This will only be the case if the option passed in to
    183      * next() included PARSE_ESCAPED and the next character was an
    184      * escape sequence.
    185      */
    186     public boolean isEscaped() {
    187         return isEscaped;
    188     }
    189 
    190     /**
    191      * Returns true if this iterator is currently within a variable expansion.
    192      */
    193     public boolean inVariable() {
    194         return buf != null;
    195     }
    196 
    197     /**
    198      * Returns an object which, when later passed to setPos(), will
    199      * restore this iterator's position.  Usage idiom:
    200      *
    201      * RuleCharacterIterator iterator = ...;
    202      * Object pos = iterator.getPos(null); // allocate position object
    203      * for (;;) {
    204      *   pos = iterator.getPos(pos); // reuse position object
    205      *   int c = iterator.next(...);
    206      *   ...
    207      * }
    208      * iterator.setPos(pos);
    209      *
    210      * @param p a position object previously returned by getPos(),
    211      * or null.  If not null, it will be updated and returned.  If
    212      * null, a new position object will be allocated and returned.
    213      * @return a position object which may be passed to setPos(),
    214      * either `p,' or if `p' == null, a newly-allocated object
    215      */
    216     public Object getPos(Object p) {
    217         if (p == null) {
    218             return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
    219         }
    220         Object[] a = (Object[]) p;
    221         a[0] = buf;
    222         int[] v = (int[]) a[1];
    223         v[0] = pos.getIndex();
    224         v[1] = bufPos;
    225         return p;
    226     }
    227 
    228     /**
    229      * Restores this iterator to the position it had when getPos()
    230      * returned the given object.
    231      * @param p a position object previously returned by getPos()
    232      */
    233     public void setPos(Object p) {
    234         Object[] a = (Object[]) p;
    235         buf = (char[]) a[0];
    236         int[] v = (int[]) a[1];
    237         pos.setIndex(v[0]);
    238         bufPos = v[1];
    239     }
    240 
    241     /**
    242      * Skips ahead past any ignored characters, as indicated by the given
    243      * options.  This is useful in conjunction with the lookahead() method.
    244      *
    245      * Currently, this only has an effect for SKIP_WHITESPACE.
    246      * @param options one or more of the following options, bitwise-OR-ed
    247      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
    248      */
    249     public void skipIgnored(int options) {
    250         if ((options & SKIP_WHITESPACE) != 0) {
    251             for (;;) {
    252                 int a = _current();
    253                 if (!PatternProps.isWhiteSpace(a)) break;
    254                 _advance(UTF16.getCharCount(a));
    255             }
    256         }
    257     }
    258 
    259     /**
    260      * Returns a string containing the remainder of the characters to be
    261      * returned by this iterator, without any option processing.  If the
    262      * iterator is currently within a variable expansion, this will only
    263      * extend to the end of the variable expansion.  This method is provided
    264      * so that iterators may interoperate with string-based APIs.  The typical
    265      * sequence of calls is to call skipIgnored(), then call lookahead(), then
    266      * parse the string returned by lookahead(), then call jumpahead() to
    267      * resynchronize the iterator.
    268      * @return a string containing the characters to be returned by future
    269      * calls to next()
    270      */
    271     public String lookahead() {
    272         if (buf != null) {
    273             return new String(buf, bufPos, buf.length - bufPos);
    274         } else {
    275             return text.substring(pos.getIndex());
    276         }
    277     }
    278 
    279     /**
    280      * Advances the position by the given number of 16-bit code units.
    281      * This is useful in conjunction with the lookahead() method.
    282      * @param count the number of 16-bit code units to jump over
    283      */
    284     public void jumpahead(int count) {
    285         if (count < 0) {
    286             throw new IllegalArgumentException();
    287         }
    288         if (buf != null) {
    289             bufPos += count;
    290             if (bufPos > buf.length) {
    291                 throw new IllegalArgumentException();
    292             }
    293             if (bufPos == buf.length) {
    294                 buf = null;
    295             }
    296         } else {
    297             int i = pos.getIndex() + count;
    298             pos.setIndex(i);
    299             if (i > text.length()) {
    300                 throw new IllegalArgumentException();
    301             }
    302         }
    303     }
    304 
    305     /**
    306      * Returns a string representation of this object, consisting of the
    307      * characters being iterated, with a '|' marking the current position.
    308      * Position within an expanded variable is <em>not</em> indicated.
    309      * @return a string representation of this object
    310      */
    311     @Override
    312     public String toString() {
    313         int b = pos.getIndex();
    314         return text.substring(0, b) + '|' + text.substring(b);
    315     }
    316 
    317     /**
    318      * Returns the current 32-bit code point without parsing escapes, parsing
    319      * variables, or skipping whitespace.
    320      * @return the current 32-bit code point
    321      */
    322     private int _current() {
    323         if (buf != null) {
    324             return UTF16.charAt(buf, 0, buf.length, bufPos);
    325         } else {
    326             int i = pos.getIndex();
    327             return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
    328         }
    329     }
    330 
    331     /**
    332      * Advances the position by the given amount.
    333      * @param count the number of 16-bit code units to advance past
    334      */
    335     private void _advance(int count) {
    336         if (buf != null) {
    337             bufPos += count;
    338             if (bufPos == buf.length) {
    339                 buf = null;
    340             }
    341         } else {
    342             pos.setIndex(pos.getIndex() + count);
    343             if (pos.getIndex() > text.length()) {
    344                 pos.setIndex(text.length());
    345             }
    346         }
    347     }
    348 }