Home | History | Annotate | Download | only in text
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.text.CharacterIterator;
     12 
     13 import com.ibm.icu.impl.CharacterIteratorWrapper;
     14 import com.ibm.icu.impl.ReplaceableUCharacterIterator;
     15 import com.ibm.icu.impl.UCharArrayIterator;
     16 import com.ibm.icu.impl.UCharacterIteratorWrapper;
     17 
     18 /**
     19  * Abstract class that defines an API for iteration on text objects.This is an interface for forward and backward
     20  * iteration and random access into a text object. Forward iteration is done with post-increment and backward iteration
     21  * is done with pre-decrement semantics, while the <code>java.text.CharacterIterator</code> interface methods provided
     22  * forward iteration with "pre-increment" and backward iteration with pre-decrement semantics. This API is more
     23  * efficient for forward iteration over code points. The other major difference is that this API can do both code unit
     24  * and code point iteration, <code>java.text.CharacterIterator</code> can only iterate over code units and is limited to
     25  * BMP (0 - 0xFFFF)
     26  *
     27  * @author Ram
     28  * @stable ICU 2.4
     29  */
     30 public abstract class UCharacterIterator implements Cloneable, UForwardCharacterIterator {
     31 
     32     /**
     33      * Protected default constructor for the subclasses
     34      *
     35      * @stable ICU 2.4
     36      */
     37     protected UCharacterIterator() {
     38     }
     39 
     40     // static final methods ----------------------------------------------------
     41 
     42     /**
     43      * Returns a <code>UCharacterIterator</code> object given a <code>Replaceable</code> object.
     44      *
     45      * @param source
     46      *            a valid source as a <code>Replaceable</code> object
     47      * @return UCharacterIterator object
     48      * @exception IllegalArgumentException
     49      *                if the argument is null
     50      * @stable ICU 2.4
     51      */
     52     public static final UCharacterIterator getInstance(Replaceable source) {
     53         return new ReplaceableUCharacterIterator(source);
     54     }
     55 
     56     /**
     57      * Returns a <code>UCharacterIterator</code> object given a source string.
     58      *
     59      * @param source
     60      *            a string
     61      * @return UCharacterIterator object
     62      * @exception IllegalArgumentException
     63      *                if the argument is null
     64      * @stable ICU 2.4
     65      */
     66     public static final UCharacterIterator getInstance(String source) {
     67         return new ReplaceableUCharacterIterator(source);
     68     }
     69 
     70     /**
     71      * Returns a <code>UCharacterIterator</code> object given a source character array.
     72      *
     73      * @param source
     74      *            an array of UTF-16 code units
     75      * @return UCharacterIterator object
     76      * @exception IllegalArgumentException
     77      *                if the argument is null
     78      * @stable ICU 2.4
     79      */
     80     public static final UCharacterIterator getInstance(char[] source) {
     81         return getInstance(source, 0, source.length);
     82     }
     83 
     84     /**
     85      * Returns a <code>UCharacterIterator</code> object given a source character array.
     86      *
     87      * @param source
     88      *            an array of UTF-16 code units
     89      * @return UCharacterIterator object
     90      * @exception IllegalArgumentException
     91      *                if the argument is null
     92      * @stable ICU 2.4
     93      */
     94     public static final UCharacterIterator getInstance(char[] source, int start, int limit) {
     95         return new UCharArrayIterator(source, start, limit);
     96     }
     97 
     98     /**
     99      * Returns a <code>UCharacterIterator</code> object given a source StringBuffer.
    100      *
    101      * @param source
    102      *            an string buffer of UTF-16 code units
    103      * @return UCharacterIterator object
    104      * @exception IllegalArgumentException
    105      *                if the argument is null
    106      * @stable ICU 2.4
    107      */
    108     public static final UCharacterIterator getInstance(StringBuffer source) {
    109         return new ReplaceableUCharacterIterator(source);
    110     }
    111 
    112     /**
    113      * Returns a <code>UCharacterIterator</code> object given a CharacterIterator.
    114      *
    115      * @param source
    116      *            a valid CharacterIterator object.
    117      * @return UCharacterIterator object
    118      * @exception IllegalArgumentException
    119      *                if the argument is null
    120      * @stable ICU 2.4
    121      */
    122     public static final UCharacterIterator getInstance(CharacterIterator source) {
    123         return new CharacterIteratorWrapper(source);
    124     }
    125 
    126     // public methods ----------------------------------------------------------
    127     /**
    128      * Returns a <code>java.text.CharacterIterator</code> object for the underlying text of this iterator. The returned
    129      * iterator is independent of this iterator.
    130      *
    131      * @return java.text.CharacterIterator object
    132      * @stable ICU 2.4
    133      */
    134     public CharacterIterator getCharacterIterator() {
    135         return new UCharacterIteratorWrapper(this);
    136     }
    137 
    138     /**
    139      * Returns the code unit at the current index. If index is out of range, returns DONE. Index is not changed.
    140      *
    141      * @return current code unit
    142      * @stable ICU 2.4
    143      */
    144     public abstract int current();
    145 
    146     /**
    147      * Returns the codepoint at the current index. If the current index is invalid, DONE is returned. If the current
    148      * index points to a lead surrogate, and there is a following trail surrogate, then the code point is returned.
    149      * Otherwise, the code unit at index is returned. Index is not changed.
    150      *
    151      * @return current codepoint
    152      * @stable ICU 2.4
    153      */
    154     public int currentCodePoint() {
    155         int ch = current();
    156         if (UTF16.isLeadSurrogate((char) ch)) {
    157             // advance the index to get the
    158             // next code point
    159             next();
    160             // due to post increment semantics
    161             // current() after next() actually
    162             // returns the char we want
    163             int ch2 = current();
    164             // current should never change
    165             // the current index so back off
    166             previous();
    167 
    168             if (UTF16.isTrailSurrogate((char) ch2)) {
    169                 // we found a surrogate pair
    170                 // return the codepoint
    171                 return Character.toCodePoint((char) ch, (char) ch2);
    172             }
    173         }
    174         return ch;
    175     }
    176 
    177     /**
    178      * Returns the length of the text
    179      *
    180      * @return length of the text
    181      * @stable ICU 2.4
    182      */
    183     public abstract int getLength();
    184 
    185     /**
    186      * Gets the current index in text.
    187      *
    188      * @return current index in text.
    189      * @stable ICU 2.4
    190      */
    191     public abstract int getIndex();
    192 
    193     /**
    194      * Returns the UTF16 code unit at index, and increments to the next code unit (post-increment semantics). If index
    195      * is out of range, DONE is returned, and the iterator is reset to the limit of the text.
    196      *
    197      * @return the next UTF16 code unit, or DONE if the index is at the limit of the text.
    198      * @stable ICU 2.4
    199      */
    200     @Override
    201     public abstract int next();
    202 
    203     /**
    204      * Returns the code point at index, and increments to the next code point (post-increment semantics). If index does
    205      * not point to a valid surrogate pair, the behavior is the same as <code>next()</code>. Otherwise the iterator is
    206      * incremented past the surrogate pair, and the code point represented by the pair is returned.
    207      *
    208      * @return the next codepoint in text, or DONE if the index is at the limit of the text.
    209      * @stable ICU 2.4
    210      */
    211     @Override
    212     public int nextCodePoint() {
    213         int ch1 = next();
    214         if (UTF16.isLeadSurrogate((char) ch1)) {
    215             int ch2 = next();
    216             if (UTF16.isTrailSurrogate((char) ch2)) {
    217                 return Character.toCodePoint((char) ch1, (char) ch2);
    218             } else if (ch2 != DONE) {
    219                 // unmatched surrogate so back out
    220                 previous();
    221             }
    222         }
    223         return ch1;
    224     }
    225 
    226     /**
    227      * Decrement to the position of the previous code unit in the text, and return it (pre-decrement semantics). If the
    228      * resulting index is less than 0, the index is reset to 0 and DONE is returned.
    229      *
    230      * @return the previous code unit in the text, or DONE if the new index is before the start of the text.
    231      * @stable ICU 2.4
    232      */
    233     public abstract int previous();
    234 
    235     /**
    236      * Retreat to the start of the previous code point in the text, and return it (pre-decrement semantics). If the
    237      * index is not preceeded by a valid surrogate pair, the behavior is the same as <code>previous()</code>. Otherwise
    238      * the iterator is decremented to the start of the surrogate pair, and the code point represented by the pair is
    239      * returned.
    240      *
    241      * @return the previous code point in the text, or DONE if the new index is before the start of the text.
    242      * @stable ICU 2.4
    243      */
    244     public int previousCodePoint() {
    245         int ch1 = previous();
    246         if (UTF16.isTrailSurrogate((char) ch1)) {
    247             int ch2 = previous();
    248             if (UTF16.isLeadSurrogate((char) ch2)) {
    249                 return Character.toCodePoint((char) ch2, (char) ch1);
    250             } else if (ch2 != DONE) {
    251                 // unmatched trail surrogate so back out
    252                 next();
    253             }
    254         }
    255         return ch1;
    256     }
    257 
    258     /**
    259      * Sets the index to the specified index in the text.
    260      *
    261      * @param index
    262      *            the index within the text.
    263      * @exception IndexOutOfBoundsException
    264      *                is thrown if an invalid index is supplied
    265      * @stable ICU 2.4
    266      */
    267     public abstract void setIndex(int index);
    268 
    269     /**
    270      * Sets the current index to the limit.
    271      *
    272      * @stable ICU 2.4
    273      */
    274     public void setToLimit() {
    275         setIndex(getLength());
    276     }
    277 
    278     /**
    279      * Sets the current index to the start.
    280      *
    281      * @stable ICU 2.4
    282      */
    283     public void setToStart() {
    284         setIndex(0);
    285     }
    286 
    287     /**
    288      * Fills the buffer with the underlying text storage of the iterator If the buffer capacity is not enough a
    289      * exception is thrown. The capacity of the fill in buffer should at least be equal to length of text in the
    290      * iterator obtained by calling <code>getLength()</code>). <b>Usage:</b>
    291      *
    292      * <pre>
    293      *         UChacterIterator iter = new UCharacterIterator.getInstance(text);
    294      *         char[] buf = new char[iter.getLength()];
    295      *         iter.getText(buf);
    296      *
    297      *         OR
    298      *         char[] buf= new char[1];
    299      *         int len = 0;
    300      *         for(;;){
    301      *             try{
    302      *                 len = iter.getText(buf);
    303      *                 break;
    304      *             }catch(IndexOutOfBoundsException e){
    305      *                 buf = new char[iter.getLength()];
    306      *             }
    307      *         }
    308      * </pre>
    309      *
    310      * @param fillIn
    311      *            an array of chars to fill with the underlying UTF-16 code units.
    312      * @param offset
    313      *            the position within the array to start putting the data.
    314      * @return the number of code units added to fillIn, as a convenience
    315      * @exception IndexOutOfBoundsException
    316      *                exception if there is not enough room after offset in the array, or if offset &lt; 0.
    317      * @stable ICU 2.4
    318      */
    319     public abstract int getText(char[] fillIn, int offset);
    320 
    321     /**
    322      * Convenience override for <code>getText(char[], int)</code> that provides an offset of 0.
    323      *
    324      * @param fillIn
    325      *            an array of chars to fill with the underlying UTF-16 code units.
    326      * @return the number of code units added to fillIn, as a convenience
    327      * @exception IndexOutOfBoundsException
    328      *                exception if there is not enough room in the array.
    329      * @stable ICU 2.4
    330      */
    331     public final int getText(char[] fillIn) {
    332         return getText(fillIn, 0);
    333     }
    334 
    335     /**
    336      * Convenience method for returning the underlying text storage as as string
    337      *
    338      * @return the underlying text storage in the iterator as a string
    339      * @stable ICU 2.4
    340      */
    341     public String getText() {
    342         char[] text = new char[getLength()];
    343         getText(text);
    344         return new String(text);
    345     }
    346 
    347     /**
    348      * Moves the current position by the number of code units specified, either forward or backward depending on the
    349      * sign of delta (positive or negative respectively). If the resulting index would be less than zero, the index is
    350      * set to zero, and if the resulting index would be greater than limit, the index is set to limit.
    351      *
    352      * @param delta
    353      *            the number of code units to move the current index.
    354      * @return the new index.
    355      * @exception IndexOutOfBoundsException
    356      *                is thrown if an invalid index is supplied
    357      * @stable ICU 2.4
    358      *
    359      */
    360     public int moveIndex(int delta) {
    361         int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
    362         setIndex(x);
    363         return x;
    364     }
    365 
    366     /**
    367      * Moves the current position by the number of code points specified, either forward or backward depending on the
    368      * sign of delta (positive or negative respectively). If the current index is at a trail surrogate then the first
    369      * adjustment is by code unit, and the remaining adjustments are by code points. If the resulting index would be
    370      * less than zero, the index is set to zero, and if the resulting index would be greater than limit, the index is
    371      * set to limit.
    372      *
    373      * @param delta
    374      *            the number of code units to move the current index.
    375      * @return the new index
    376      * @exception IndexOutOfBoundsException
    377      *                is thrown if an invalid delta is supplied
    378      * @stable ICU 2.4
    379      */
    380     public int moveCodePointIndex(int delta) {
    381         if (delta > 0) {
    382             while (delta > 0 && nextCodePoint() != DONE) {
    383                 delta--;
    384             }
    385         } else {
    386             while (delta < 0 && previousCodePoint() != DONE) {
    387                 delta++;
    388             }
    389         }
    390         if (delta != 0) {
    391             throw new IndexOutOfBoundsException();
    392         }
    393 
    394         return getIndex();
    395     }
    396 
    397     /**
    398      * Creates a copy of this iterator, independent from other iterators. If it is not possible to clone the iterator,
    399      * returns null.
    400      *
    401      * @return copy of this iterator
    402      * @stable ICU 2.4
    403      */
    404     @Override
    405     public Object clone() throws CloneNotSupportedException {
    406         return super.clone();
    407     }
    408 
    409 }
    410