/**
 *******************************************************************************
 * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                        *
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

/*DictionaryBreakEngine::DictionaryBreakEngine() {
    fTypes = 0;
}*/

DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}
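
// For orientation: with the bit mask above, an engine constructed for
// (1 << UBRK_WORD) | (1 << UBRK_LINE) -- as the Thai engine below is -- would
// report, for a character c in its fSet (illustrative sketch only):
//
//     handles(c, UBRK_WORD)      --> TRUE
//     handles(c, UBRK_LINE)      --> TRUE
//     handles(c, UBRK_CHARACTER) --> FALSE   (bit not set in fTypes)
//     handles(0x0041, UBRK_WORD) --> FALSE   ('A' is not in fSet)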

int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO:  recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}
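
// Note on the flow above: the caller positions the text iterator somewhere
// inside a run of dictionary characters; findBreaks() first spans that run
// (backwards or forwards as requested) to establish [rangeStart, rangeEnd),
// and only then, if breakType is one this engine was constructed for, hands
// the range to the subclass's divideUpDictionaryRange().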

void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*void
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
    fTypes = breakTypes;
}*/

/*
 ******************************************************************
 */


// Helper class for improving readability of the Thai word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20

class PossibleWord {
 private:
  // list of word candidate lengths, in increasing length order
  int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
  int       count;      // Count of candidates
  int32_t   prefix;     // The longest match with a dictionary word
  int32_t   offset;     // Offset in the text of these candidates
  int       mark;       // The preferred candidate (index into lengths)
  int       current;    // The candidate we're currently looking at

 public:
  PossibleWord();
  ~PossibleWord();

  // Fill the list of candidates if needed, select the longest, and return the number found
  int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );

  // Select the currently marked candidate, point after it in the text, and invalidate self
  int32_t   acceptMarked( UText *text );

  // Back up from the current candidate to the next shorter one; return TRUE if that exists
  // and point the text after it
  UBool     backUp( UText *text );

  // Return the longest prefix this candidate location shares with a dictionary word
  int32_t   longestPrefix();

  // Mark the current candidate as the one we like
  void      markCurrent();
};
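
// Typical use by the Thai engine below, as a sketch only (the real loop is in
// ThaiBreakEngine::divideUpDictionaryRange):
//
//     PossibleWord words[THAI_LOOKAHEAD];
//     int n = words[0].candidates(text, fDictionary, rangeEnd);  // longest candidate first
//     if (n > 1) {
//         do {
//             // ... look ahead; if the text after this candidate also parses,
//             words[0].markCurrent();          // remember it as the preferred one
//         } while (words[0].backUp(text));     // then retry with the next shorter candidate
//     }
//     int32_t wordLength = words[0].acceptMarked(text);  // commit to the marked candidate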

inline
PossibleWord::PossibleWord() {
    offset = -1;
}

inline
PossibleWord::~PossibleWord() {
}

inline int
PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

inline int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + lengths[mark]);
    return lengths[mark];
}

inline UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + lengths[--current]);
        return TRUE;
    }
    return FALSE;
}

inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}

inline void
PossibleWord::markCurrent() {
    mark = current;
}

// How many words in a row are "good enough"?
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
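
// With the values above, a range shorter than THAI_MIN_WORD_SPAN (4) characters
// is handed back unsegmented; see the length check at the top of
// ThaiBreakEngine::divideUpDictionaryRange() below.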

ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}

int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here whether or not we found a word. Now look ahead to the
        // next word: if it's not a dictionary word, we will combine it with the
        // word we just found (if there is one), but only if the preceding word
        // does not exceed the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: The original Apple code checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
    if (!adoptDictionary->getValued()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            // UNICODE_STRING_SIMPLE does not process \u escapes, so add the two
            // prolonged sound marks as individual code points.
            cjSet.add(0xFF70);      // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC);      // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}

CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}
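
// For orientation: the table above makes runs of roughly 3-5 Katakana the
// cheapest (e.g. getKatakanaCost(4) == 204), while a run longer than
// kMaxKatakanaLength costs 8192, the same as a zero-length run, so very long
// runs are effectively discouraged as single words.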

static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9Fu);
}

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange.
template<class T, size_t N>
class AutoBuffer {
 public:
  AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    if (size > N) {
      buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
      capacity = size;
    }
  }
  ~AutoBuffer() {
    if (buffer != stackBuffer)
      uprv_free(buffer);
  }
#if 0
  T* operator& () {
    return buffer;
  }
#endif
  T* elems() {
    return buffer;
  }
  const T& operator[] (size_t i) const {
    return buffer[i];
  }
  T& operator[] (size_t i) {
    return buffer[i];
  }

  // resize without copy
  void resize(size_t size) {
    if (size <= capacity)
      return;
    if (buffer != stackBuffer)
      uprv_free(buffer);
    buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    capacity = size;
  }
 private:
  T stackBuffer[N];
  T* buffer;
  AutoBuffer();
  size_t capacity;
};
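
// Usage sketch: AutoBuffer keeps small allocations on the stack and only falls
// back to the heap when the requested size exceeds the template size N, e.g.
//
//     AutoBuffer<UChar, defaultInputLength> charString(inputLength);
//     utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
//
// as divideUpDictionaryRange() does below. Note that resize() does not preserve
// the existing contents.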


/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output UStack to receive the int32_t break positions
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status))
        return 0;

    UnicodeString inputString(charString.elems(), inputLength);
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status))
            return 0;
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status))
        return 0;

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i=1; i<=numChars; i++){
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i=0; i<=numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
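    // A sketch of the recurrence implemented by the loop below, where values[j]
    // is the dictionary cost (lower is more likely) of the j-th match starting
    // at position i:
    //
    //     bestSnlp[0] = 0
    //     bestSnlp[i + lengths[j]] = min(bestSnlp[i + lengths[j]],
    //                                    bestSnlp[i] + values[j])
    //
    // so bestSnlp[numChars] ends up as the cost of the cheapest segmentation of
    // the whole range, and prev[] holds the back pointers used to recover it.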
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;

        int count;
        // limit maximum word length matched to size of current substring
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // If there is no single-character match in the dictionary starting with
        // this character, treat it as a 1-character word with the highest value
        // possible, i.e. the least likely to occur. Exclude Korean characters
        // from this treatment, as they should be left together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))){
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        for (int j = 0; j < count; j++){
            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare, so we
        // apply the following heuristic to Katakana: any continuous run of
        // Katakana characters is considered a candidate word with a default
        // cost specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in reverse order, i.e. t_boundary[0] = numChars, and
    // reverse it later when the values are copied into foundBreaks.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        for (int i = numChars; i > 0; i = prev[i]){
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */