Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2014, International Business Machines Corporation and         *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.text.CharacterIterator;
     12 
     13 import com.ibm.icu.impl.CharacterIteration;
     14 
     15 abstract class DictionaryBreakEngine implements LanguageBreakEngine {
     16 
     17     /* Helper class for improving readability of the Thai/Lao/Khmer word break
     18      * algorithm.
     19      */
     20     static class PossibleWord {
     21         // List size, limited by the maximum number of words in the dictionary
     22         // that form a nested sequence.
     23         private final static int POSSIBLE_WORD_LIST_MAX = 20;
     24         //list of word candidate lengths, in increasing length order
     25         private int lengths[];
     26         private int count[];    // Count of candidates
     27         private int prefix;     // The longest match with a dictionary word
     28         private int offset;     // Offset in the text of these candidates
     29         private int mark;       // The preferred candidate's offset
     30         private int current;    // The candidate we're currently looking at
     31 
     32         // Default constructor
     33         public PossibleWord() {
     34             lengths = new int[POSSIBLE_WORD_LIST_MAX];
     35             count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference
     36             offset = -1;
     37         }
     38 
     39         // Fill the list of candidates if needed, select the longest, and return the number found
     40         public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
     41             int start = fIter.getIndex();
     42             if (start != offset) {
     43                 offset = start;
     44                 prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);
     45                 // Dictionary leaves text after longest prefix, not longest word. Back up.
     46                 if (count[0] <= 0) {
     47                     fIter.setIndex(start);
     48                 }
     49             }
     50             if (count[0] > 0) {
     51                 fIter.setIndex(start + lengths[count[0]-1]);
     52             }
     53             current = count[0] - 1;
     54             mark = current;
     55             return count[0];
     56         }
     57 
     58         // Select the currently marked candidate, point after it in the text, and invalidate self
     59         public int acceptMarked(CharacterIterator fIter) {
     60             fIter.setIndex(offset + lengths[mark]);
     61             return lengths[mark];
     62         }
     63 
     64         // Backup from the current candidate to the next shorter one; return true if that exists
     65         // and point the text after it
     66         public boolean backUp(CharacterIterator fIter) {
     67             if (current > 0) {
     68                 fIter.setIndex(offset + lengths[--current]);
     69                 return true;
     70             }
     71             return false;
     72         }
     73 
     74         // Return the longest prefix this candidate location shares with a dictionary word
     75         public int longestPrefix() {
     76             return prefix;
     77         }
     78 
     79         // Mark the current candidate as the one we like
     80         public void markCurrent() {
     81             mark = current;
     82         }
     83     }
     84 
     85     /**
     86      *  A deque-like structure holding raw ints.
     87      *  Partial, limited implementation, only what is needed by the dictionary implementation.
     88      *  For internal use only.
     89      * @internal
     90      */
     91     static class DequeI implements Cloneable {
     92         private int[] data = new int[50];
     93         private int lastIdx = 4;   // or base of stack. Index of element.
     94         private int firstIdx = 4;  // or Top of Stack. Index of element + 1.
     95 
     96         @Override
     97         public Object clone() throws CloneNotSupportedException {
     98             DequeI result = (DequeI)super.clone();
     99             result.data = data.clone();
    100             return result;
    101         }
    102 
    103         int size() {
    104             return firstIdx - lastIdx;
    105         }
    106 
    107         boolean isEmpty() {
    108             return size() == 0;
    109         }
    110 
    111         private void grow() {
    112             int[] newData = new int[data.length * 2];
    113             System.arraycopy(data,  0,  newData,  0, data.length);
    114             data = newData;
    115         }
    116 
    117         void offer(int v) {
    118             // Note that the actual use cases of offer() add at most one element.
    119             //   We make no attempt to handle more than a few.
    120             assert lastIdx > 0;
    121             data[--lastIdx] = v;
    122         }
    123 
    124         void push(int v) {
    125             if (firstIdx >= data.length) {
    126                 grow();
    127             }
    128             data[firstIdx++] = v;
    129         }
    130 
    131         int pop() {
    132             assert size() > 0;
    133             return data[--firstIdx];
    134         }
    135 
    136         int peek() {
    137             assert size() > 0;
    138             return data[firstIdx - 1];
    139         }
    140 
    141         int peekLast() {
    142             assert size() > 0;
    143             return data[lastIdx];
    144         }
    145 
    146         int pollLast() {
    147             assert size() > 0;
    148             return data[lastIdx++];
    149         }
    150 
    151         boolean contains(int v) {
    152             for (int i=lastIdx; i< firstIdx; i++) {
    153                 if (data[i] == v) {
    154                     return true;
    155                 }
    156             }
    157             return false;
    158         }
    159 
    160         int elementAt(int i) {
    161             assert i < size();
    162             return data[lastIdx + i];
    163         }
    164 
    165         void removeAllElements() {
    166             lastIdx = firstIdx = 4;
    167         }
    168     }
    169 
    170     UnicodeSet fSet = new UnicodeSet();
    171 
    172     /**
    173      *  Constructor
    174      */
    175     public DictionaryBreakEngine() {
    176     }
    177 
    178     @Override
    179     public boolean handles(int c) {
    180         return fSet.contains(c);        // we recognize the character
    181     }
    182 
    183     @Override
    184     public int findBreaks(CharacterIterator text, int startPos, int endPos,
    185             DequeI foundBreaks) {
    186         int result = 0;
    187 
    188          // Find the span of characters included in the set.
    189          //   The span to break begins at the current position int the text, and
    190          //   extends towards the start or end of the text, depending on 'reverse'.
    191 
    192         int start = text.getIndex();
    193         int current;
    194         int rangeStart;
    195         int rangeEnd;
    196         int c = CharacterIteration.current32(text);
    197         while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
    198             CharacterIteration.next32(text);
    199             c = CharacterIteration.current32(text);
    200         }
    201         rangeStart = start;
    202         rangeEnd = current;
    203 
    204         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    205         text.setIndex(current);
    206 
    207         return result;
    208     }
    209 
    210     void setCharacters(UnicodeSet set) {
    211         fSet = new UnicodeSet(set);
    212         fSet.compact();
    213     }
    214 
    215     /**
    216      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    217      *
    218      * @param text A UText representing the text
    219      * @param rangeStart The start of the range of dictionary characters
    220      * @param rangeEnd The end of the range of dictionary characters
    221      * @param foundBreaks Output of break positions. Positions are pushed.
    222      *                    Pre-existing contents of the output stack are unaltered.
    223      * @return The number of breaks found
    224      */
    225      abstract int divideUpDictionaryRange(CharacterIterator text,
    226                                           int               rangeStart,
    227                                           int               rangeEnd,
    228                                           DequeI            foundBreaks );
    229 }
    230