Home | History | Annotate | Download | only in method
      1 
      2 /*
      3  * Copyright (C) 2011 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 package android.text.method;
     19 
     20 import android.text.Selection;
     21 import android.text.SpannableStringBuilder;
     22 
     23 import android.icu.text.BreakIterator;
     24 import java.util.Locale;
     25 
     26 /**
     27  * Walks through cursor positions at word boundaries. Internally uses
     28  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
     29  * for performance reasons.
     30  *
     31  * Also provides methods to determine word boundaries.
     32  * {@hide}
     33  */
     34 public class WordIterator implements Selection.PositionIterator {
     35     // Size of the window for the word iterator, should be greater than the longest word's length
     36     private static final int WINDOW_WIDTH = 50;
     37 
     38     private String mString;
     39     private int mOffsetShift;
     40 
     41     private BreakIterator mIterator;
     42 
     43     /**
     44      * Constructs a WordIterator using the default locale.
     45      */
     46     public WordIterator() {
     47         this(Locale.getDefault());
     48     }
     49 
     50     /**
     51      * Constructs a new WordIterator for the specified locale.
     52      * @param locale The locale to be used when analysing the text.
     53      */
     54     public WordIterator(Locale locale) {
     55         mIterator = BreakIterator.getWordInstance(locale);
     56     }
     57 
     58     public void setCharSequence(CharSequence charSequence, int start, int end) {
     59         mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
     60         final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
     61 
     62         if (charSequence instanceof SpannableStringBuilder) {
     63             mString = ((SpannableStringBuilder) charSequence).substring(mOffsetShift, windowEnd);
     64         } else {
     65             mString = charSequence.subSequence(mOffsetShift, windowEnd).toString();
     66         }
     67         mIterator.setText(mString);
     68     }
     69 
     70     /** {@inheritDoc} */
     71     public int preceding(int offset) {
     72         int shiftedOffset = offset - mOffsetShift;
     73         do {
     74             shiftedOffset = mIterator.preceding(shiftedOffset);
     75             if (shiftedOffset == BreakIterator.DONE) {
     76                 return BreakIterator.DONE;
     77             }
     78             if (isOnLetterOrDigit(shiftedOffset)) {
     79                 return shiftedOffset + mOffsetShift;
     80             }
     81         } while (true);
     82     }
     83 
     84     /** {@inheritDoc} */
     85     public int following(int offset) {
     86         int shiftedOffset = offset - mOffsetShift;
     87         do {
     88             shiftedOffset = mIterator.following(shiftedOffset);
     89             if (shiftedOffset == BreakIterator.DONE) {
     90                 return BreakIterator.DONE;
     91             }
     92             if (isAfterLetterOrDigit(shiftedOffset)) {
     93                 return shiftedOffset + mOffsetShift;
     94             }
     95         } while (true);
     96     }
     97 
     98     /** {@inheritDoc} */
     99     public boolean isBoundary(int offset) {
    100         int shiftedOffset = offset - mOffsetShift;
    101         checkOffsetIsValid(shiftedOffset);
    102         return mIterator.isBoundary(shiftedOffset);
    103     }
    104 
    105     /**
    106      * Returns the position of next boundary after the given offset. Returns
    107      * {@code DONE} if there is no boundary after the given offset.
    108      *
    109      * @param offset the given start position to search from.
    110      * @return the position of the last boundary preceding the given offset.
    111      */
    112     public int nextBoundary(int offset) {
    113         int shiftedOffset = offset - mOffsetShift;
    114         shiftedOffset = mIterator.following(shiftedOffset);
    115         if (shiftedOffset == BreakIterator.DONE) {
    116             return BreakIterator.DONE;
    117         }
    118         return shiftedOffset + mOffsetShift;
    119     }
    120 
    121     /**
    122      * Returns the position of boundary preceding the given offset or
    123      * {@code DONE} if the given offset specifies the starting position.
    124      *
    125      * @param offset the given start position to search from.
    126      * @return the position of the last boundary preceding the given offset.
    127      */
    128     public int prevBoundary(int offset) {
    129         int shiftedOffset = offset - mOffsetShift;
    130         shiftedOffset = mIterator.preceding(shiftedOffset);
    131         if (shiftedOffset == BreakIterator.DONE) {
    132             return BreakIterator.DONE;
    133         }
    134         return shiftedOffset + mOffsetShift;
    135     }
    136 
    137     /** If <code>offset</code> is within a word, returns the index of the first character of that
    138      * word, otherwise returns BreakIterator.DONE.
    139      *
    140      * The offsets that are considered to be part of a word are the indexes of its characters,
    141      * <i>as well as</i> the index of its last character plus one.
    142      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
    143      *
    144      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
    145      * The returned value is within [0..offset] or BreakIterator.DONE.
    146      *
    147      * @throws IllegalArgumentException is offset is not valid.
    148      */
    149     public int getBeginning(int offset) {
    150         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
    151         // so this method can be removed.
    152         return getBeginning(offset, false);
    153     }
    154 
    155     /**
    156      * If <code>offset</code> is within a word, returns the index of the last character of that
    157      * word plus one, otherwise returns BreakIterator.DONE.
    158      *
    159      * The offsets that are considered to be part of a word are the indexes of its characters,
    160      * <i>as well as</i> the index of its last character plus one.
    161      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
    162      *
    163      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
    164      * The returned value is within [offset..textLength] or BreakIterator.DONE.
    165      *
    166      * @throws IllegalArgumentException is offset is not valid.
    167      */
    168     public int getEnd(int offset) {
    169         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
    170         // so this method can be removed.
    171         return getEnd(offset, false);
    172     }
    173 
    174     /**
    175      * If the <code>offset</code> is within a word or on a word boundary that can only be
    176      * considered the start of a word (e.g. _word where "_" is any character that would not
    177      * be considered part of the word) then this returns the index of the first character of
    178      * that word.
    179      *
    180      * If the offset is on a word boundary that can be considered the start and end of a
    181      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    182      * between AA and BB, this would return the start of the previous word, AA.
    183      *
    184      * Returns BreakIterator.DONE if there is no previous boundary.
    185      *
    186      * @throws IllegalArgumentException is offset is not valid.
    187      */
    188     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
    189         return getBeginning(offset, true);
    190     }
    191 
    192     /**
    193      * If the <code>offset</code> is within a word or on a word boundary that can only be
    194      * considered the end of a word (e.g. word_ where "_" is any character that would not
    195      * be considered part of the word) then this returns the index of the last character
    196      * plus one of that word.
    197      *
    198      * If the offset is on a word boundary that can be considered the start and end of a
    199      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    200      * between AA and BB, this would return the end of the next word, BB.
    201      *
    202      * Returns BreakIterator.DONE if there is no next boundary.
    203      *
    204      * @throws IllegalArgumentException is offset is not valid.
    205      */
    206     public int getNextWordEndOnTwoWordBoundary(int offset) {
    207         return getEnd(offset, true);
    208     }
    209 
    210     /**
    211      * If the <code>offset</code> is within a word or on a word boundary that can only be
    212      * considered the start of a word (e.g. _word where "_" is any character that would not
    213      * be considered part of the word) then this returns the index of the first character of
    214      * that word.
    215      *
    216      * If the offset is on a word boundary that can be considered the start and end of a
    217      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    218      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
    219      * return the start of the previous word, AA. Otherwise it would return the current offset,
    220      * the start of BB.
    221      *
    222      * Returns BreakIterator.DONE if there is no previous boundary.
    223      *
    224      * @throws IllegalArgumentException is offset is not valid.
    225      */
    226     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
    227         final int shiftedOffset = offset - mOffsetShift;
    228         checkOffsetIsValid(shiftedOffset);
    229 
    230         if (isOnLetterOrDigit(shiftedOffset)) {
    231             if (mIterator.isBoundary(shiftedOffset)
    232                     && (!isAfterLetterOrDigit(shiftedOffset)
    233                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
    234                 return shiftedOffset + mOffsetShift;
    235             } else {
    236                 return mIterator.preceding(shiftedOffset) + mOffsetShift;
    237             }
    238         } else {
    239             if (isAfterLetterOrDigit(shiftedOffset)) {
    240                 return mIterator.preceding(shiftedOffset) + mOffsetShift;
    241             }
    242         }
    243         return BreakIterator.DONE;
    244     }
    245 
    246     /**
    247      * If the <code>offset</code> is within a word or on a word boundary that can only be
    248      * considered the end of a word (e.g. word_ where "_" is any character that would not be
    249      * considered part of the word) then this returns the index of the last character plus one
    250      * of that word.
    251      *
    252      * If the offset is on a word boundary that can be considered the start and end of a
    253      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    254      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
    255      * the end of the next word, BB. Otherwise it would return the current offset, the end
    256      * of AA.
    257      *
    258      * Returns BreakIterator.DONE if there is no next boundary.
    259      *
    260      * @throws IllegalArgumentException is offset is not valid.
    261      */
    262     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
    263         final int shiftedOffset = offset - mOffsetShift;
    264         checkOffsetIsValid(shiftedOffset);
    265 
    266         if (isAfterLetterOrDigit(shiftedOffset)) {
    267             if (mIterator.isBoundary(shiftedOffset)
    268                     && (!isOnLetterOrDigit(shiftedOffset) || !getNextWordEndOnTwoWordBoundary)) {
    269                 return shiftedOffset + mOffsetShift;
    270             } else {
    271                 return mIterator.following(shiftedOffset) + mOffsetShift;
    272             }
    273         } else {
    274             if (isOnLetterOrDigit(shiftedOffset)) {
    275                 return mIterator.following(shiftedOffset) + mOffsetShift;
    276             }
    277         }
    278         return BreakIterator.DONE;
    279     }
    280 
    281     /**
    282      * If <code>offset</code> is within a group of punctuation as defined
    283      * by {@link #isPunctuation(int)}, returns the index of the first character
    284      * of that group, otherwise returns BreakIterator.DONE.
    285      *
    286      * @param offset the offset to search from.
    287      */
    288     public int getPunctuationBeginning(int offset) {
    289         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
    290             offset = prevBoundary(offset);
    291         }
    292         // No need to shift offset, prevBoundary handles that.
    293         return offset;
    294     }
    295 
    296     /**
    297      * If <code>offset</code> is within a group of punctuation as defined
    298      * by {@link #isPunctuation(int)}, returns the index of the last character
    299      * of that group plus one, otherwise returns BreakIterator.DONE.
    300      *
    301      * @param offset the offset to search from.
    302      */
    303     public int getPunctuationEnd(int offset) {
    304         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
    305             offset = nextBoundary(offset);
    306         }
    307         // No need to shift offset, nextBoundary handles that.
    308         return offset;
    309     }
    310 
    311     /**
    312      * Indicates if the provided offset is after a punctuation character
    313      * as defined by {@link #isPunctuation(int)}.
    314      *
    315      * @param offset the offset to check from.
    316      * @return Whether the offset is after a punctuation character.
    317      */
    318     public boolean isAfterPunctuation(int offset) {
    319         final int shiftedOffset = offset - mOffsetShift;
    320         if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
    321             final int codePoint = mString.codePointBefore(shiftedOffset);
    322             return isPunctuation(codePoint);
    323         }
    324         return false;
    325     }
    326 
    327     /**
    328      * Indicates if the provided offset is at a punctuation character
    329      * as defined by {@link #isPunctuation(int)}.
    330      *
    331      * @param offset the offset to check from.
    332      * @return Whether the offset is at a punctuation character.
    333      */
    334     public boolean isOnPunctuation(int offset) {
    335         final int shiftedOffset = offset - mOffsetShift;
    336         if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
    337             final int codePoint = mString.codePointAt(shiftedOffset);
    338             return isPunctuation(codePoint);
    339         }
    340         return false;
    341     }
    342 
    343     private boolean isPunctuationStartBoundary(int offset) {
    344         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    345     }
    346 
    347     private boolean isPunctuationEndBoundary(int offset) {
    348         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    349     }
    350 
    351     private boolean isPunctuation(int cp) {
    352         int type = Character.getType(cp);
    353         return (type == Character.CONNECTOR_PUNCTUATION ||
    354                 type == Character.DASH_PUNCTUATION ||
    355                 type == Character.END_PUNCTUATION ||
    356                 type == Character.FINAL_QUOTE_PUNCTUATION ||
    357                 type == Character.INITIAL_QUOTE_PUNCTUATION ||
    358                 type == Character.OTHER_PUNCTUATION ||
    359                 type == Character.START_PUNCTUATION);
    360     }
    361 
    362     private boolean isAfterLetterOrDigit(int shiftedOffset) {
    363         if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
    364             final int codePoint = mString.codePointBefore(shiftedOffset);
    365             if (Character.isLetterOrDigit(codePoint)) return true;
    366         }
    367         return false;
    368     }
    369 
    370     private boolean isOnLetterOrDigit(int shiftedOffset) {
    371         if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
    372             final int codePoint = mString.codePointAt(shiftedOffset);
    373             if (Character.isLetterOrDigit(codePoint)) return true;
    374         }
    375         return false;
    376     }
    377 
    378     private void checkOffsetIsValid(int shiftedOffset) {
    379         if (shiftedOffset < 0 || shiftedOffset > mString.length()) {
    380             throw new IllegalArgumentException("Invalid offset: " + (shiftedOffset + mOffsetShift) +
    381                     ". Valid range is [" + mOffsetShift + ", " + (mString.length() + mOffsetShift) +
    382                     "]");
    383         }
    384     }
    385 }
    386