Home | History | Annotate | Download | only in method
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package android.text.method;
     18 
     19 import android.annotation.NonNull;
     20 import android.icu.lang.UCharacter;
     21 import android.icu.lang.UProperty;
     22 import android.icu.text.BreakIterator;
     23 import android.text.CharSequenceCharacterIterator;
     24 import android.text.Selection;
     25 
     26 import java.util.Locale;
     27 
     28 /**
     29  * Walks through cursor positions at word boundaries. Internally uses
     30  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
     31  * for performance reasons.
     32  *
     33  * Also provides methods to determine word boundaries.
     34  * {@hide}
     35  */
     36 public class WordIterator implements Selection.PositionIterator {
     37     // Size of the window for the word iterator, should be greater than the longest word's length
     38     private static final int WINDOW_WIDTH = 50;
     39 
     40     private int mStart, mEnd;
     41     private CharSequence mCharSeq;
     42     private final BreakIterator mIterator;
     43 
     44     /**
     45      * Constructs a WordIterator using the default locale.
     46      */
     47     public WordIterator() {
     48         this(Locale.getDefault());
     49     }
     50 
     51     /**
     52      * Constructs a new WordIterator for the specified locale.
     53      * @param locale The locale to be used for analyzing the text.
     54      */
     55     public WordIterator(Locale locale) {
     56         mIterator = BreakIterator.getWordInstance(locale);
     57     }
     58 
     59     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
     60         if (0 <= start && end <= charSequence.length()) {
     61             mCharSeq = charSequence;
     62             mStart = Math.max(0, start - WINDOW_WIDTH);
     63             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
     64             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
     65         } else {
     66             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
     67         }
     68     }
     69 
     70     /** {@inheritDoc} */
     71     public int preceding(int offset) {
     72         checkOffsetIsValid(offset);
     73         while (true) {
     74             offset = mIterator.preceding(offset);
     75             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
     76                 return offset;
     77             }
     78         }
     79     }
     80 
     81     /** {@inheritDoc} */
     82     public int following(int offset) {
     83         checkOffsetIsValid(offset);
     84         while (true) {
     85             offset = mIterator.following(offset);
     86             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
     87                 return offset;
     88             }
     89         }
     90     }
     91 
     92     /** {@inheritDoc} */
     93     public boolean isBoundary(int offset) {
     94         checkOffsetIsValid(offset);
     95         return mIterator.isBoundary(offset);
     96     }
     97 
     98     /**
     99      * Returns the position of next boundary after the given offset. Returns
    100      * {@code DONE} if there is no boundary after the given offset.
    101      *
    102      * @param offset the given start position to search from.
    103      * @return the position of the last boundary preceding the given offset.
    104      */
    105     public int nextBoundary(int offset) {
    106         checkOffsetIsValid(offset);
    107         return mIterator.following(offset);
    108     }
    109 
    110     /**
    111      * Returns the position of boundary preceding the given offset or
    112      * {@code DONE} if the given offset specifies the starting position.
    113      *
    114      * @param offset the given start position to search from.
    115      * @return the position of the last boundary preceding the given offset.
    116      */
    117     public int prevBoundary(int offset) {
    118         checkOffsetIsValid(offset);
    119         return mIterator.preceding(offset);
    120     }
    121 
    122     /** If <code>offset</code> is within a word, returns the index of the first character of that
    123      * word, otherwise returns BreakIterator.DONE.
    124      *
    125      * The offsets that are considered to be part of a word are the indexes of its characters,
    126      * <i>as well as</i> the index of its last character plus one.
    127      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
    128      *
    129      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
    130      * The returned value is within [0..offset] or BreakIterator.DONE.
    131      *
    132      * @throws IllegalArgumentException is offset is not valid.
    133      */
    134     public int getBeginning(int offset) {
    135         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
    136         // so this method can be removed.
    137         return getBeginning(offset, false);
    138     }
    139 
    140     /**
    141      * If <code>offset</code> is within a word, returns the index of the last character of that
    142      * word plus one, otherwise returns BreakIterator.DONE.
    143      *
    144      * The offsets that are considered to be part of a word are the indexes of its characters,
    145      * <i>as well as</i> the index of its last character plus one.
    146      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
    147      *
    148      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
    149      * The returned value is within [offset..textLength] or BreakIterator.DONE.
    150      *
    151      * @throws IllegalArgumentException is offset is not valid.
    152      */
    153     public int getEnd(int offset) {
    154         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
    155         // so this method can be removed.
    156         return getEnd(offset, false);
    157     }
    158 
    159     /**
    160      * If the <code>offset</code> is within a word or on a word boundary that can only be
    161      * considered the start of a word (e.g. _word where "_" is any character that would not
    162      * be considered part of the word) then this returns the index of the first character of
    163      * that word.
    164      *
    165      * If the offset is on a word boundary that can be considered the start and end of a
    166      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    167      * between AA and BB, this would return the start of the previous word, AA.
    168      *
    169      * Returns BreakIterator.DONE if there is no previous boundary.
    170      *
    171      * @throws IllegalArgumentException is offset is not valid.
    172      */
    173     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
    174         return getBeginning(offset, true);
    175     }
    176 
    177     /**
    178      * If the <code>offset</code> is within a word or on a word boundary that can only be
    179      * considered the end of a word (e.g. word_ where "_" is any character that would not
    180      * be considered part of the word) then this returns the index of the last character
    181      * plus one of that word.
    182      *
    183      * If the offset is on a word boundary that can be considered the start and end of a
    184      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    185      * between AA and BB, this would return the end of the next word, BB.
    186      *
    187      * Returns BreakIterator.DONE if there is no next boundary.
    188      *
    189      * @throws IllegalArgumentException is offset is not valid.
    190      */
    191     public int getNextWordEndOnTwoWordBoundary(int offset) {
    192         return getEnd(offset, true);
    193     }
    194 
    195     /**
    196      * If the <code>offset</code> is within a word or on a word boundary that can only be
    197      * considered the start of a word (e.g. _word where "_" is any character that would not
    198      * be considered part of the word) then this returns the index of the first character of
    199      * that word.
    200      *
    201      * If the offset is on a word boundary that can be considered the start and end of a
    202      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    203      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
    204      * return the start of the previous word, AA. Otherwise it would return the current offset,
    205      * the start of BB.
    206      *
    207      * Returns BreakIterator.DONE if there is no previous boundary.
    208      *
    209      * @throws IllegalArgumentException is offset is not valid.
    210      */
    211     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
    212         checkOffsetIsValid(offset);
    213 
    214         if (isOnLetterOrDigit(offset)) {
    215             if (mIterator.isBoundary(offset)
    216                     && (!isAfterLetterOrDigit(offset)
    217                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
    218                 return offset;
    219             } else {
    220                 return mIterator.preceding(offset);
    221             }
    222         } else {
    223             if (isAfterLetterOrDigit(offset)) {
    224                 return mIterator.preceding(offset);
    225             }
    226         }
    227         return BreakIterator.DONE;
    228     }
    229 
    230     /**
    231      * If the <code>offset</code> is within a word or on a word boundary that can only be
    232      * considered the end of a word (e.g. word_ where "_" is any character that would not be
    233      * considered part of the word) then this returns the index of the last character plus one
    234      * of that word.
    235      *
    236      * If the offset is on a word boundary that can be considered the start and end of a
    237      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
    238      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
    239      * the end of the next word, BB. Otherwise it would return the current offset, the end
    240      * of AA.
    241      *
    242      * Returns BreakIterator.DONE if there is no next boundary.
    243      *
    244      * @throws IllegalArgumentException is offset is not valid.
    245      */
    246     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
    247         checkOffsetIsValid(offset);
    248 
    249         if (isAfterLetterOrDigit(offset)) {
    250             if (mIterator.isBoundary(offset)
    251                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
    252                 return offset;
    253             } else {
    254                 return mIterator.following(offset);
    255             }
    256         } else {
    257             if (isOnLetterOrDigit(offset)) {
    258                 return mIterator.following(offset);
    259             }
    260         }
    261         return BreakIterator.DONE;
    262     }
    263 
    264     /**
    265      * If <code>offset</code> is within a group of punctuation as defined
    266      * by {@link #isPunctuation(int)}, returns the index of the first character
    267      * of that group, otherwise returns BreakIterator.DONE.
    268      *
    269      * @param offset the offset to search from.
    270      */
    271     public int getPunctuationBeginning(int offset) {
    272         checkOffsetIsValid(offset);
    273         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
    274             offset = prevBoundary(offset);
    275         }
    276         // No need to shift offset, prevBoundary handles that.
    277         return offset;
    278     }
    279 
    280     /**
    281      * If <code>offset</code> is within a group of punctuation as defined
    282      * by {@link #isPunctuation(int)}, returns the index of the last character
    283      * of that group plus one, otherwise returns BreakIterator.DONE.
    284      *
    285      * @param offset the offset to search from.
    286      */
    287     public int getPunctuationEnd(int offset) {
    288         checkOffsetIsValid(offset);
    289         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
    290             offset = nextBoundary(offset);
    291         }
    292         // No need to shift offset, nextBoundary handles that.
    293         return offset;
    294     }
    295 
    296     /**
    297      * Indicates if the provided offset is after a punctuation character
    298      * as defined by {@link #isPunctuation(int)}.
    299      *
    300      * @param offset the offset to check from.
    301      * @return Whether the offset is after a punctuation character.
    302      */
    303     public boolean isAfterPunctuation(int offset) {
    304         if (mStart < offset && offset <= mEnd) {
    305             final int codePoint = Character.codePointBefore(mCharSeq, offset);
    306             return isPunctuation(codePoint);
    307         }
    308         return false;
    309     }
    310 
    311     /**
    312      * Indicates if the provided offset is at a punctuation character
    313      * as defined by {@link #isPunctuation(int)}.
    314      *
    315      * @param offset the offset to check from.
    316      * @return Whether the offset is at a punctuation character.
    317      */
    318     public boolean isOnPunctuation(int offset) {
    319         if (mStart <= offset && offset < mEnd) {
    320             final int codePoint = Character.codePointAt(mCharSeq, offset);
    321             return isPunctuation(codePoint);
    322         }
    323         return false;
    324     }
    325 
    326     /**
    327      * Indicates if the codepoint is a mid-word-only punctuation.
    328      *
    329      * At the moment, this is locale-independent, and includes all the characters in
    330      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
    331      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
    332      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
    333      * in the middle of a word, but they become word breaks if they happen at the end of a word
    334      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
    335      *
    336      * @param locale the locale to consider the codepoint in. Presently ignored.
    337      * @param codePoint the codepoint to check.
    338      * @return True if the codepoint is a mid-word punctuation.
    339      */
    340     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
    341         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
    342         return (wb == UCharacter.WordBreak.MIDLETTER
    343                 || wb == UCharacter.WordBreak.MIDNUMLET
    344                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
    345     }
    346 
    347     private boolean isPunctuationStartBoundary(int offset) {
    348         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    349     }
    350 
    351     private boolean isPunctuationEndBoundary(int offset) {
    352         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    353     }
    354 
    355     private static boolean isPunctuation(int cp) {
    356         final int type = Character.getType(cp);
    357         return (type == Character.CONNECTOR_PUNCTUATION
    358                 || type == Character.DASH_PUNCTUATION
    359                 || type == Character.END_PUNCTUATION
    360                 || type == Character.FINAL_QUOTE_PUNCTUATION
    361                 || type == Character.INITIAL_QUOTE_PUNCTUATION
    362                 || type == Character.OTHER_PUNCTUATION
    363                 || type == Character.START_PUNCTUATION);
    364     }
    365 
    366     private boolean isAfterLetterOrDigit(int offset) {
    367         if (mStart < offset && offset <= mEnd) {
    368             final int codePoint = Character.codePointBefore(mCharSeq, offset);
    369             if (Character.isLetterOrDigit(codePoint)) return true;
    370         }
    371         return false;
    372     }
    373 
    374     private boolean isOnLetterOrDigit(int offset) {
    375         if (mStart <= offset && offset < mEnd) {
    376             final int codePoint = Character.codePointAt(mCharSeq, offset);
    377             if (Character.isLetterOrDigit(codePoint)) return true;
    378         }
    379         return false;
    380     }
    381 
    382     private void checkOffsetIsValid(int offset) {
    383         if (!(mStart <= offset && offset <= mEnd)) {
    384             throw new IllegalArgumentException("Invalid offset: " + (offset) +
    385                     ". Valid range is [" + mStart + ", " + mEnd + "]");
    386         }
    387     }
    388 }
    389