/**
 *******************************************************************************
 * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                        *
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"

#include <stdio.h>

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

/*DictionaryBreakEngine::DictionaryBreakEngine() {
    fTypes = 0;
}*/

DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

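// Return TRUE if this engine can handle character c for the given break type:
// the break type must be one of the types passed to the constructor and c must
// be in the engine's character set.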
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}

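// Scan from the current position for the contiguous run of characters that
// belong to this engine's character set (backwards when reverse is TRUE),
// then, if the break type is handled, hand that range to
// divideUpDictionaryRange().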
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO:  recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}

void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*void
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
    fTypes = breakTypes;
}*/

/*
 ******************************************************************
 */


// Helper class for improving readability of the Thai word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20

class PossibleWord {
 private:
  // list of word candidate lengths, in increasing length order
  int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
  int       count;      // Count of candidates
  int32_t   prefix;     // The longest match with a dictionary word
  int32_t   offset;     // Offset in the text of these candidates
  int       mark;       // The preferred candidate's offset
  int       current;    // The candidate we're currently looking at

 public:
  PossibleWord();
  ~PossibleWord();

  // Fill the list of candidates if needed, select the longest, and return the number found
  int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );

  // Select the currently marked candidate, point after it in the text, and invalidate self
  int32_t   acceptMarked( UText *text );

  // Back up from the current candidate to the next shorter one; return TRUE if that exists
  // and point the text after it
  UBool     backUp( UText *text );

  // Return the longest prefix this candidate location shares with a dictionary word
  int32_t   longestPrefix();

  // Mark the current candidate as the one we like
  void      markCurrent();
};
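
// Illustrative use (a simplified sketch of the lookahead loop in
// ThaiBreakEngine::divideUpDictionaryRange below, not a separate API):
//
//   PossibleWord word;
//   if (word.candidates(text, dict, rangeEnd) > 1) {
//       do {
//           // look at the words that follow; call word.markCurrent() when
//           // the current candidate leads to further dictionary words
//       } while (word.backUp(text));
//       int32_t len = word.acceptMarked(text);   // commit to the marked candidate
//   }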

inline
PossibleWord::PossibleWord() {
    offset = -1;
}

inline
PossibleWord::~PossibleWord() {
}

inline int
PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

inline int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + lengths[mark]);
    return lengths[mark];
}

inline UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + lengths[--current]);
        return TRUE;
    }
    return FALSE;
}

inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}

inline void
PossibleWord::markCurrent() {
    mark = current;
}

// How many words in a row are "good enough"?
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)

ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}

int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
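    // Ring of lookahead slots: slot (wordsFound % THAI_LOOKAHEAD) holds the
    // candidates at the current position, and the next two slots are reused
    // for the following positions while looking ahead.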
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: The original Apple code checked that the next two
                        // characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
    if (!adoptDictionary->getValued()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            // Add KATAKANA-HIRAGANA PROLONGED SOUND MARK (U+30FC) and its halfwidth
            // form (U+FF70) as code points. UNICODE_STRING_SIMPLE does not unescape,
            // so passing the escaped pattern string to add() would insert a literal
            // multi-character string element rather than these two characters.
            cjSet.add(0x30FC);
            cjSet.add(0xFF70);
            setCharacters(cjSet);
        }
    }
}

CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9Fu);
}
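// Note on the ranges above: U+30A1..U+30FE lies in the Katakana block
// (U+30FB KATAKANA MIDDLE DOT is excluded), and U+FF66..U+FF9F covers the
// halfwidth Katakana forms, including the halfwidth voicing marks.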

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange.
template<class T, size_t N>
class AutoBuffer {
 public:
  AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    if (size > N) {
      buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
      capacity = size;
    }
  }
  ~AutoBuffer() {
    if (buffer != stackBuffer)
      uprv_free(buffer);
  }
#if 0
  T* operator& () {
    return buffer;
  }
#endif
  T* elems() {
    return buffer;
  }
  const T& operator[] (size_t i) const {
    return buffer[i];
  }
  T& operator[] (size_t i) {
    return buffer[i];
  }

  // resize without copy
  void resize(size_t size) {
    if (size <= capacity)
      return;
    if (buffer != stackBuffer)
      uprv_free(buffer);
    buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    capacity = size;
  }
 private:
  T stackBuffer[N];
  T* buffer;
  AutoBuffer();
  size_t capacity;
};
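
// Typical use (a sketch, mirroring the calls below): requests of up to N
// elements live on the stack, larger requests fall back to the heap, and
// resize() discards any existing contents.
//
//   AutoBuffer<UChar, 80> buf(length);   // stack storage if length <= 80
//   utext_extract(text, start, limit, buf.elems(), length, &status);
//   buf.resize(biggerLength);            // note: previous contents are lost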


/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output UStack to receive the break positions
 * @return The number of breaks found
 */
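//
// Outline of the approach (as implemented below): extract the range and
// NFKC-normalize it if needed, keep a map (charPositions) from normalized
// indices back to raw input indices, run a dynamic-programming search over
// dictionary matches (with a heuristic for runs of Katakana), then convert
// the chosen boundaries back to raw indices and push them onto foundBreaks.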
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status))
        return 0;

    UnicodeString inputString(charString.elems(), inputLength);
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status))
            return 0;
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status))
        return 0;

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i=1; i<=numChars; i++){
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the start index (in the normalized string) of the last word
    // in the best segmentation of the first i characters.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i=0; i<=numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
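    // The recurrence (a restatement of the loop below): if position i is
    // reachable (bestSnlp[i] < kuint32max) and the dictionary has a word of
    // length len with cost c starting at i, then
    //     bestSnlp[i + len] = min(bestSnlp[i + len], bestSnlp[i] + c)
    // and prev[i + len] records i so the winning segmentation can be traced back.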
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;

        int count;
        // limit maximum word length matched to size of current substring
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // If no single-character word was found in the dictionary starting
        // with this character, treat the character as a 1-character word with
        // the highest cost possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))){
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        for (int j = 0; j < count; j++){
            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare, so we
        // apply the following heuristic to Katakana: any continuous run of
        // Katakana characters is considered a candidate word with a default
        // cost specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We push the indices in reverse order, i.e. t_boundary[0] = numChars,
    // and reverse them when copying to foundBreaks below.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    }

    // Add a break for the start of the dictionary range unless there is
    // already a break there.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */