Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2014, International Business Machines Corporation and         *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.text.CharacterIterator;
     13 import java.util.BitSet;
     14 
     15 import android.icu.impl.CharacterIteration;
     16 
     17 abstract class DictionaryBreakEngine implements LanguageBreakEngine {
     18 
     19     /* Helper class for improving readability of the Thai/Lao/Khmer word break
     20      * algorithm.
     21      */
     22     static class PossibleWord {
     23         // List size, limited by the maximum number of words in the dictionary
     24         // that form a nested sequence.
     25         private final static int POSSIBLE_WORD_LIST_MAX = 20;
     26         //list of word candidate lengths, in increasing length order
     27         private int lengths[];
     28         private int count[];    // Count of candidates
     29         private int prefix;     // The longest match with a dictionary word
     30         private int offset;     // Offset in the text of these candidates
     31         private int mark;       // The preferred candidate's offset
     32         private int current;    // The candidate we're currently looking at
     33 
     34         // Default constructor
     35         public PossibleWord() {
     36             lengths = new int[POSSIBLE_WORD_LIST_MAX];
     37             count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference
     38             offset = -1;
     39         }
     40 
     41         // Fill the list of candidates if needed, select the longest, and return the number found
     42         public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
     43             int start = fIter.getIndex();
     44             if (start != offset) {
     45                 offset = start;
     46                 prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);
     47                 // Dictionary leaves text after longest prefix, not longest word. Back up.
     48                 if (count[0] <= 0) {
     49                     fIter.setIndex(start);
     50                 }
     51             }
     52             if (count[0] > 0) {
     53                 fIter.setIndex(start + lengths[count[0]-1]);
     54             }
     55             current = count[0] - 1;
     56             mark = current;
     57             return count[0];
     58         }
     59 
     60         // Select the currently marked candidate, point after it in the text, and invalidate self
     61         public int acceptMarked(CharacterIterator fIter) {
     62             fIter.setIndex(offset + lengths[mark]);
     63             return lengths[mark];
     64         }
     65 
     66         // Backup from the current candidate to the next shorter one; return true if that exists
     67         // and point the text after it
     68         public boolean backUp(CharacterIterator fIter) {
     69             if (current > 0) {
     70                 fIter.setIndex(offset + lengths[--current]);
     71                 return true;
     72             }
     73             return false;
     74         }
     75 
     76         // Return the longest prefix this candidate location shares with a dictionary word
     77         public int longestPrefix() {
     78             return prefix;
     79         }
     80 
     81         // Mark the current candidate as the one we like
     82         public void markCurrent() {
     83             mark = current;
     84         }
     85     }
     86 
     87     /**
     88      *  A deque-like structure holding raw ints.
     89      *  Partial, limited implementation, only what is needed by the dictionary implementation.
     90      *  For internal use only.
     91      * @hide draft / provisional / internal are hidden on Android
     92      */
     93     static class DequeI implements Cloneable {
     94         private int[] data = new int[50];
     95         private int lastIdx = 4;   // or base of stack. Index of element.
     96         private int firstIdx = 4;  // or Top of Stack. Index of element + 1.
     97 
     98         @Override
     99         public Object clone() throws CloneNotSupportedException {
    100             DequeI result = (DequeI)super.clone();
    101             result.data = data.clone();
    102             return result;
    103         }
    104 
    105         int size() {
    106             return firstIdx - lastIdx;
    107         }
    108 
    109         boolean isEmpty() {
    110             return size() == 0;
    111         }
    112 
    113         private void grow() {
    114             int[] newData = new int[data.length * 2];
    115             System.arraycopy(data,  0,  newData,  0, data.length);
    116             data = newData;
    117         }
    118 
    119         void offer(int v) {
    120             // Note that the actual use cases of offer() add at most one element.
    121             //   We make no attempt to handle more than a few.
    122             assert lastIdx > 0;
    123             data[--lastIdx] = v;
    124         }
    125 
    126         void push(int v) {
    127             if (firstIdx >= data.length) {
    128                 grow();
    129             }
    130             data[firstIdx++] = v;
    131         }
    132 
    133         int pop() {
    134             assert size() > 0;
    135             return data[--firstIdx];
    136         }
    137 
    138         int peek() {
    139             assert size() > 0;
    140             return data[firstIdx - 1];
    141         }
    142 
    143         int peekLast() {
    144             assert size() > 0;
    145             return data[lastIdx];
    146         }
    147 
    148         int pollLast() {
    149             assert size() > 0;
    150             return data[lastIdx++];
    151         }
    152 
    153         boolean contains(int v) {
    154             for (int i=lastIdx; i< firstIdx; i++) {
    155                 if (data[i] == v) {
    156                     return true;
    157                 }
    158             }
    159             return false;
    160         }
    161 
    162         int elementAt(int i) {
    163             assert i < size();
    164             return data[lastIdx + i];
    165         }
    166 
    167         void removeAllElements() {
    168             lastIdx = firstIdx = 4;
    169         }
    170     }
    171 
    172     UnicodeSet fSet = new UnicodeSet();
    173     private BitSet fTypes = new BitSet(32);
    174 
    175     /**
    176      * @param breakTypes The types of break iterators that can use this engine.
    177      *  For example, BreakIterator.KIND_LINE
    178      */
    179     public DictionaryBreakEngine(Integer... breakTypes) {
    180         for (Integer type: breakTypes) {
    181             fTypes.set(type);
    182         }
    183     }
    184 
    185     @Override
    186     public boolean handles(int c, int breakType) {
    187         return fTypes.get(breakType) &&  // this type can use us
    188                 fSet.contains(c);        // we recognize the character
    189     }
    190 
    191     @Override
    192     public int findBreaks(CharacterIterator text, int startPos, int endPos,
    193             int breakType, DequeI foundBreaks) {
    194         int result = 0;
    195 
    196          // Find the span of characters included in the set.
    197          //   The span to break begins at the current position int the text, and
    198          //   extends towards the start or end of the text, depending on 'reverse'.
    199 
    200         int start = text.getIndex();
    201         int current;
    202         int rangeStart;
    203         int rangeEnd;
    204         int c = CharacterIteration.current32(text);
    205         while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
    206             CharacterIteration.next32(text);
    207             c = CharacterIteration.current32(text);
    208         }
    209         rangeStart = start;
    210         rangeEnd = current;
    211 
    212         // if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
    213         // TODO: Why does icu4c have this?
    214         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    215         text.setIndex(current);
    216 
    217         return result;
    218     }
    219 
    220     void setCharacters(UnicodeSet set) {
    221         fSet = new UnicodeSet(set);
    222         fSet.compact();
    223     }
    224 
    225     /**
    226      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    227      *
    228      * @param text A UText representing the text
    229      * @param rangeStart The start of the range of dictionary characters
    230      * @param rangeEnd The end of the range of dictionary characters
    231      * @param foundBreaks Output of break positions. Positions are pushed.
    232      *                    Pre-existing contents of the output stack are unaltered.
    233      * @return The number of breaks found
    234      */
    235      abstract int divideUpDictionaryRange(CharacterIterator text,
    236                                           int               rangeStart,
    237                                           int               rangeEnd,
    238                                           DequeI            foundBreaks );
    239 }
    240