Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2014, International Business Machines Corporation and         *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.io.IOException;
     13 import java.text.CharacterIterator;
     14 
     15 import android.icu.lang.UCharacter;
     16 import android.icu.lang.UProperty;
     17 import android.icu.lang.UScript;
     18 
     19 class ThaiBreakEngine extends DictionaryBreakEngine {
     20 
     21     // Constants for ThaiBreakIterator
     22     // How many words in a row are "good enough"?
     23     private static final byte THAI_LOOKAHEAD = 3;
     24     // Will not combine a non-word with a preceding dictionary word longer than this
     25     private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3;
     26     // Will not combine a non-word that shares at least this much prefix with a
     27     // dictionary word with a preceding word
     28     private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3;
     29     // Ellision character
     30     private static final char THAI_PAIYANNOI = 0x0E2F;
     31     // Repeat character
     32     private static final char THAI_MAIYAMOK = 0x0E46;
     33     // Minimum word size
     34     private static final byte THAI_MIN_WORD = 2;
     35     // Minimum number of characters for two words
     36     private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
     37 
     38     private DictionaryMatcher fDictionary;
     39     private static UnicodeSet fThaiWordSet;
     40     private static UnicodeSet fEndWordSet;
     41     private static UnicodeSet fBeginWordSet;
     42     private static UnicodeSet fSuffixSet;
     43     private static UnicodeSet fMarkSet;
     44 
     45     static {
     46         // Initialize UnicodeSets
     47         fThaiWordSet = new UnicodeSet();
     48         fMarkSet = new UnicodeSet();
     49         fBeginWordSet = new UnicodeSet();
     50         fSuffixSet = new UnicodeSet();
     51 
     52         fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
     53         fThaiWordSet.compact();
     54 
     55         fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
     56         fMarkSet.add(0x0020);
     57         fEndWordSet = new UnicodeSet(fThaiWordSet);
     58         fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
     59         fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
     60         fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
     61         fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
     62         fSuffixSet.add(THAI_PAIYANNOI);
     63         fSuffixSet.add(THAI_MAIYAMOK);
     64 
     65         // Compact for caching
     66         fMarkSet.compact();
     67         fEndWordSet.compact();
     68         fBeginWordSet.compact();
     69         fSuffixSet.compact();
     70 
     71         // Freeze the static UnicodeSet
     72         fThaiWordSet.freeze();
     73         fMarkSet.freeze();
     74         fEndWordSet.freeze();
     75         fBeginWordSet.freeze();
     76         fSuffixSet.freeze();
     77     }
     78 
     79     public ThaiBreakEngine() throws IOException {
     80         super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
     81         setCharacters(fThaiWordSet);
     82         // Initialize dictionary
     83         fDictionary = DictionaryData.loadDictionaryFor("Thai");
     84     }
     85 
     86     public boolean equals(Object obj) {
     87         // Normally is a singleton, but it's possible to have duplicates
     88         //   during initialization. All are equivalent.
     89         return obj instanceof ThaiBreakEngine;
     90     }
     91 
     92     public int hashCode() {
     93         return getClass().hashCode();
     94     }
     95 
     96     public boolean handles(int c, int breakType) {
     97         if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
     98             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
     99             return (script == UScript.THAI);
    100         }
    101         return false;
    102     }
    103 
    104     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
    105             DequeI foundBreaks) {
    106 
    107         if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
    108             return 0;  // Not enough characters for word
    109         }
    110         int wordsFound = 0;
    111         int wordLength;
    112         PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
    113         for (int i = 0; i < THAI_LOOKAHEAD; i++) {
    114             words[i] = new PossibleWord();
    115         }
    116 
    117         int uc;
    118         fIter.setIndex(rangeStart);
    119         int current;
    120         while ((current = fIter.getIndex()) < rangeEnd) {
    121             wordLength = 0;
    122 
    123             //Look for candidate words at the current position
    124             int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
    125 
    126             // If we found exactly one, use that
    127             if (candidates == 1) {
    128                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
    129                 wordsFound += 1;
    130             }
    131 
    132             // If there was more than one, see which one can take us forward the most words
    133             else if (candidates > 1) {
    134                 // If we're already at the end of the range, we're done
    135                 if (fIter.getIndex() < rangeEnd) {
    136                   foundBest:
    137                     do {
    138                         int wordsMatched = 1;
    139                         if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
    140                             if (wordsMatched < 2) {
    141                                 // Followed by another dictionary word; mark first word as a good candidate
    142                                 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    143                                 wordsMatched = 2;
    144                             }
    145 
    146                             // If we're already at the end of the range, we're done
    147                             if (fIter.getIndex() >= rangeEnd) {
    148                                 break foundBest;
    149                             }
    150 
    151                             // See if any of the possible second words is followed by a third word
    152                             do {
    153                                 // If we find a third word, stop right away
    154                                 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
    155                                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    156                                     break foundBest;
    157                                 }
    158                             } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
    159                         }
    160                     }
    161                     while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
    162                     // foundBest: end of loop
    163                 }
    164                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
    165                 wordsFound += 1;
    166             }
    167 
    168             // We come here after having either found a word or not. We look ahead to the
    169             // next word. If it's not a dictionary word, we will combine it with the word we
    170             // just found (if there is one), but only if the preceding word does not exceed
    171             // the threshold.
    172             // The text iterator should now be positioned at the end of the word we found.
    173             if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
    174                 // If it is a dictionary word, do nothing. If it isn't, then if there is
    175                 // no preceding word, or the non-word shares less than the minimum threshold
    176                 // of characters with a dictionary word, then scan to resynchronize
    177                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
    178                         (wordLength == 0 ||
    179                                 words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
    180                     // Look for a plausible word boundary
    181                     int remaining = rangeEnd - (current + wordLength);
    182                     int pc = fIter.current();
    183                     int chars = 0;
    184                     for (;;) {
    185                         fIter.next();
    186                         uc = fIter.current();
    187                         chars += 1;
    188                         if (--remaining <= 0) {
    189                             break;
    190                         }
    191                         if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    192                             // Maybe. See if it's in the dictionary.
    193                             // Note: In the original Apple code, checked that the next
    194                             // two characters after uc were not 0x0E4C THANTHAKHAT before
    195                             // checking the dictionary. That is just a performance filter,
    196                             // but it's not clear it's faster than checking the trie
    197                             int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
    198                             fIter.setIndex(current + wordLength + chars);
    199                             if (candidate > 0) {
    200                                 break;
    201                             }
    202                         }
    203                         pc = uc;
    204                     }
    205 
    206                     // Bump the word count if there wasn't already one
    207                     if (wordLength <= 0) {
    208                         wordsFound += 1;
    209                     }
    210 
    211                     // Update the length with the passed-over characters
    212                     wordLength += chars;
    213                 } else {
    214                     // Backup to where we were for next iteration
    215                     fIter.setIndex(current+wordLength);
    216                 }
    217             }
    218 
    219             // Never stop before a combining mark.
    220             int currPos;
    221             while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
    222                 fIter.next();
    223                 wordLength += fIter.getIndex() - currPos;
    224             }
    225 
    226             // Look ahead for possible suffixes if a dictionary word does not follow.
    227             // We do this in code rather than using a rule so that the heuristic
    228             // resynch continues to function. For example, one of the suffix characters
    229             // could be a typo in the middle of a word.
    230             if (fIter.getIndex() < rangeEnd && wordLength > 0) {
    231                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
    232                         fSuffixSet.contains(uc = fIter.current())) {
    233                     if (uc == THAI_PAIYANNOI) {
    234                         if (!fSuffixSet.contains(fIter.previous())) {
    235                             // Skip over previous end and PAIYANNOI
    236                             fIter.next();
    237                             fIter.next();
    238                             wordLength += 1;
    239                             uc = fIter.current();
    240                         } else {
    241                             // Restore prior position
    242                             fIter.next();
    243                         }
    244                     }
    245                     if (uc == THAI_MAIYAMOK) {
    246                         if (fIter.previous() != THAI_MAIYAMOK) {
    247                             // Skip over previous end and MAIYAMOK
    248                             fIter.next();
    249                             fIter.next();
    250                             wordLength += 1;
    251                         } else {
    252                             // restore prior position
    253                             fIter.next();
    254                         }
    255                     }
    256                 } else {
    257                     fIter.setIndex(current + wordLength);
    258                 }
    259             }
    260 
    261             // Did we find a word on this iteration? If so, push it on the break stack
    262             if (wordLength > 0) {
    263                 foundBreaks.push(Integer.valueOf(current + wordLength));
    264             }
    265         }
    266 
    267         // Don't return a break for the end of the dictionary range if there is one there
    268         if (foundBreaks.peek() >= rangeEnd) {
    269             foundBreaks.pop();
    270             wordsFound -= 1;
    271         }
    272 
    273         return wordsFound;
    274     }
    275 
    276 }
    277