Home | History | Annotate | Download | only in common
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2015, International Business Machines Corporation
      4  * and others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "brkeng.h"
     13 #include "dictbe.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/chariter.h"
     16 #include "unicode/ubrk.h"
     17 #include "uvectr32.h"
     18 #include "uvector.h"
     19 #include "uassert.h"
     20 #include "unicode/normlzr.h"
     21 #include "cmemory.h"
     22 #include "dictionarydata.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /*
     27  ******************************************************************
     28  */
     29 
     30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
     31     fTypes = breakTypes;
     32 }
     33 
     34 DictionaryBreakEngine::~DictionaryBreakEngine() {
     35 }
     36 
     37 UBool
     38 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
     39     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
     40             && fSet.contains(c));
     41 }
     42 
     43 int32_t
     44 DictionaryBreakEngine::findBreaks( UText *text,
     45                                  int32_t startPos,
     46                                  int32_t endPos,
     47                                  UBool reverse,
     48                                  int32_t breakType,
     49                                  UStack &foundBreaks ) const {
     50     int32_t result = 0;
     51 
     52     // Find the span of characters included in the set.
     53     //   The span to break begins at the current position in the text, and
     54     //   extends towards the start or end of the text, depending on 'reverse'.
     55 
     56     int32_t start = (int32_t)utext_getNativeIndex(text);
     57     int32_t current;
     58     int32_t rangeStart;
     59     int32_t rangeEnd;
     60     UChar32 c = utext_current32(text);
     61     if (reverse) {
     62         UBool   isDict = fSet.contains(c);
     63         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
     64             c = utext_previous32(text);
     65             isDict = fSet.contains(c);
     66         }
     67         if (current < startPos) {
     68             rangeStart = startPos;
     69         } else {
     70             rangeStart = current;
     71             if (!isDict) {
     72                 utext_next32(text);
     73                 rangeStart = utext_getNativeIndex(text);
     74             }
     75         }
     76         // rangeEnd = start + 1;
     77         utext_setNativeIndex(text, start);
     78         utext_next32(text);
     79         rangeEnd = utext_getNativeIndex(text);
     80     }
     81     else {
     82         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
     83             utext_next32(text);         // TODO:  recast loop for postincrement
     84             c = utext_current32(text);
     85         }
     86         rangeStart = start;
     87         rangeEnd = current;
     88     }
     89     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
     90         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
     91         utext_setNativeIndex(text, current);
     92     }
     93 
     94     return result;
     95 }
     96 
     97 void
     98 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
     99     fSet = set;
    100     // Compact for caching
    101     fSet.compact();
    102 }
    103 
    104 /*
    105  ******************************************************************
    106  * PossibleWord
    107  */
    108 
    109 // Helper class for improving readability of the Thai/Lao/Khmer word break
    110 // algorithm. The implementation is completely inline.
    111 
    112 // List size, limited by the maximum number of words in the dictionary
    113 // that form a nested sequence.
    114 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
    115 
    116 class PossibleWord {
    117 private:
    118     // list of word candidate lengths, in increasing length order
    119     // TODO: bytes would be sufficient for word lengths.
    120     int32_t   count;      // Count of candidates
    121     int32_t   prefix;     // The longest match with a dictionary word
    122     int32_t   offset;     // Offset in the text of these candidates
    123     int32_t   mark;       // The preferred candidate's offset
    124     int32_t   current;    // The candidate we're currently looking at
    125     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
    126     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
    127 
    128 public:
    129     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
    130     ~PossibleWord() {};
    131 
    132     // Fill the list of candidates if needed, select the longest, and return the number found
    133     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
    134 
    135     // Select the currently marked candidate, point after it in the text, and invalidate self
    136     int32_t   acceptMarked( UText *text );
    137 
    138     // Back up from the current candidate to the next shorter one; return TRUE if that exists
    139     // and point the text after it
    140     UBool     backUp( UText *text );
    141 
    142     // Return the longest prefix this candidate location shares with a dictionary word
    143     // Return value is in code points.
    144     int32_t   longestPrefix() { return prefix; };
    145 
    146     // Mark the current candidate as the one we like
    147     void      markCurrent() { mark = current; };
    148 
    149     // Get length in code points of the marked word.
    150     int32_t   markedCPLength() { return cpLengths[mark]; };
    151 };
    152 
    153 
    154 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    155     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    156     int32_t start = (int32_t)utext_getNativeIndex(text);
    157     if (start != offset) {
    158         offset = start;
    159         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
    160         // Dictionary leaves text after longest prefix, not longest word. Back up.
    161         if (count <= 0) {
    162             utext_setNativeIndex(text, start);
    163         }
    164     }
    165     if (count > 0) {
    166         utext_setNativeIndex(text, start+cuLengths[count-1]);
    167     }
    168     current = count-1;
    169     mark = current;
    170     return count;
    171 }
    172 
    173 int32_t
    174 PossibleWord::acceptMarked( UText *text ) {
    175     utext_setNativeIndex(text, offset + cuLengths[mark]);
    176     return cuLengths[mark];
    177 }
    178 
    179 
    180 UBool
    181 PossibleWord::backUp( UText *text ) {
    182     if (current > 0) {
    183         utext_setNativeIndex(text, offset + cuLengths[--current]);
    184         return TRUE;
    185     }
    186     return FALSE;
    187 }
    188 
    189 /*
    190  ******************************************************************
    191  * ThaiBreakEngine
    192  */
    193 
    194 // How many words in a row are "good enough"?
    195 static const int32_t THAI_LOOKAHEAD = 3;
    196 
    197 // Will not combine a non-word with a preceding dictionary word longer than this
    198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
    199 
    200 // Will not combine a non-word that shares at least this much prefix with a
    201 // dictionary word, with a preceding word
    202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
    203 
    204 // Ellision character
    205 static const int32_t THAI_PAIYANNOI = 0x0E2F;
    206 
    207 // Repeat character
    208 static const int32_t THAI_MAIYAMOK = 0x0E46;
    209 
    210 // Minimum word size
    211 static const int32_t THAI_MIN_WORD = 2;
    212 
    213 // Minimum number of characters for two words
    214 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
    215 
    216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    217     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    218       fDictionary(adoptDictionary)
    219 {
    220     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    221     if (U_SUCCESS(status)) {
    222         setCharacters(fThaiWordSet);
    223     }
    224     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    225     fMarkSet.add(0x0020);
    226     fEndWordSet = fThaiWordSet;
    227     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    228     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    229     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    230     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    231     fSuffixSet.add(THAI_PAIYANNOI);
    232     fSuffixSet.add(THAI_MAIYAMOK);
    233 
    234     // Compact for caching.
    235     fMarkSet.compact();
    236     fEndWordSet.compact();
    237     fBeginWordSet.compact();
    238     fSuffixSet.compact();
    239 }
    240 
    241 ThaiBreakEngine::~ThaiBreakEngine() {
    242     delete fDictionary;
    243 }
    244 
    245 int32_t
    246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
    247                                                 int32_t rangeStart,
    248                                                 int32_t rangeEnd,
    249                                                 UStack &foundBreaks ) const {
    250     utext_setNativeIndex(text, rangeStart);
    251     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
    252     if (utext_getNativeIndex(text) >= rangeEnd) {
    253         return 0;       // Not enough characters for two words
    254     }
    255     utext_setNativeIndex(text, rangeStart);
    256 
    257 
    258     uint32_t wordsFound = 0;
    259     int32_t cpWordLength = 0;    // Word Length in Code Points.
    260     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
    261     int32_t current;
    262     UErrorCode status = U_ZERO_ERROR;
    263     PossibleWord words[THAI_LOOKAHEAD];
    264 
    265     utext_setNativeIndex(text, rangeStart);
    266 
    267     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    268         cpWordLength = 0;
    269         cuWordLength = 0;
    270 
    271         // Look for candidate words at the current position
    272         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    273 
    274         // If we found exactly one, use that
    275         if (candidates == 1) {
    276             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    277             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
    278             wordsFound += 1;
    279         }
    280         // If there was more than one, see which one can take us forward the most words
    281         else if (candidates > 1) {
    282             // If we're already at the end of the range, we're done
    283             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    284                 goto foundBest;
    285             }
    286             do {
    287                 int32_t wordsMatched = 1;
    288                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    289                     if (wordsMatched < 2) {
    290                         // Followed by another dictionary word; mark first word as a good candidate
    291                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    292                         wordsMatched = 2;
    293                     }
    294 
    295                     // If we're already at the end of the range, we're done
    296                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    297                         goto foundBest;
    298                     }
    299 
    300                     // See if any of the possible second words is followed by a third word
    301                     do {
    302                         // If we find a third word, stop right away
    303                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    304                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
    305                             goto foundBest;
    306                         }
    307                     }
    308                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
    309                 }
    310             }
    311             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
    312 foundBest:
    313             // Set UText position to after the accepted word.
    314             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    315             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
    316             wordsFound += 1;
    317         }
    318 
    319         // We come here after having either found a word or not. We look ahead to the
    320         // next word. If it's not a dictionary word, we will combine it with the word we
    321         // just found (if there is one), but only if the preceding word does not exceed
    322         // the threshold.
    323         // The text iterator should now be positioned at the end of the word we found.
    324 
    325         UChar32 uc = 0;
    326         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
    327             // if it is a dictionary word, do nothing. If it isn't, then if there is
    328             // no preceding word, or the non-word shares less than the minimum threshold
    329             // of characters with a dictionary word, then scan to resynchronize
    330             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    331                   && (cuWordLength == 0
    332                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
    333                 // Look for a plausible word boundary
    334                 int32_t remaining = rangeEnd - (current+cuWordLength);
    335                 UChar32 pc;
    336                 int32_t chars = 0;
    337                 for (;;) {
    338                     int32_t pcIndex = utext_getNativeIndex(text);
    339                     pc = utext_next32(text);
    340                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
    341                     chars += pcSize;
    342                     remaining -= pcSize;
    343                     if (remaining <= 0) {
    344                         break;
    345                     }
    346                     uc = utext_current32(text);
    347                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    348                         // Maybe. See if it's in the dictionary.
    349                         // NOTE: In the original Apple code, checked that the next
    350                         // two characters after uc were not 0x0E4C THANTHAKHAT before
    351                         // checking the dictionary. That is just a performance filter,
    352                         // but it's not clear it's faster than checking the trie.
    353                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    354                         utext_setNativeIndex(text, current + cuWordLength + chars);
    355                         if (candidates > 0) {
    356                             break;
    357                         }
    358                     }
    359                 }
    360 
    361                 // Bump the word count if there wasn't already one
    362                 if (cuWordLength <= 0) {
    363                     wordsFound += 1;
    364                 }
    365 
    366                 // Update the length with the passed-over characters
    367                 cuWordLength += chars;
    368             }
    369             else {
    370                 // Back up to where we were for next iteration
    371                 utext_setNativeIndex(text, current+cuWordLength);
    372             }
    373         }
    374 
    375         // Never stop before a combining mark.
    376         int32_t currPos;
    377         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    378             utext_next32(text);
    379             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    380         }
    381 
    382         // Look ahead for possible suffixes if a dictionary word does not follow.
    383         // We do this in code rather than using a rule so that the heuristic
    384         // resynch continues to function. For example, one of the suffix characters
    385         // could be a typo in the middle of a word.
    386         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
    387             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    388                 && fSuffixSet.contains(uc = utext_current32(text))) {
    389                 if (uc == THAI_PAIYANNOI) {
    390                     if (!fSuffixSet.contains(utext_previous32(text))) {
    391                         // Skip over previous end and PAIYANNOI
    392                         utext_next32(text);
    393                         int32_t paiyannoiIndex = utext_getNativeIndex(text);
    394                         utext_next32(text);
    395                         cuWordLength += utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
    396                         uc = utext_current32(text);     // Fetch next character
    397                     }
    398                     else {
    399                         // Restore prior position
    400                         utext_next32(text);
    401                     }
    402                 }
    403                 if (uc == THAI_MAIYAMOK) {
    404                     if (utext_previous32(text) != THAI_MAIYAMOK) {
    405                         // Skip over previous end and MAIYAMOK
    406                         utext_next32(text);
    407                         int32_t maiyamokIndex = utext_getNativeIndex(text);
    408                         utext_next32(text);
    409                         cuWordLength += utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
    410                     }
    411                     else {
    412                         // Restore prior position
    413                         utext_next32(text);
    414                     }
    415                 }
    416             }
    417             else {
    418                 utext_setNativeIndex(text, current+cuWordLength);
    419             }
    420         }
    421 
    422         // Did we find a word on this iteration? If so, push it on the break stack
    423         if (cuWordLength > 0) {
    424             foundBreaks.push((current+cuWordLength), status);
    425         }
    426     }
    427 
    428     // Don't return a break for the end of the dictionary range if there is one there.
    429     if (foundBreaks.peeki() >= rangeEnd) {
    430         (void) foundBreaks.popi();
    431         wordsFound -= 1;
    432     }
    433 
    434     return wordsFound;
    435 }
    436 
    437 /*
    438  ******************************************************************
    439  * LaoBreakEngine
    440  */
    441 
    442 // How many words in a row are "good enough"?
    443 static const int32_t LAO_LOOKAHEAD = 3;
    444 
    445 // Will not combine a non-word with a preceding dictionary word longer than this
    446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
    447 
    448 // Will not combine a non-word that shares at least this much prefix with a
    449 // dictionary word, with a preceding word
    450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
    451 
    452 // Minimum word size
    453 static const int32_t LAO_MIN_WORD = 2;
    454 
    455 // Minimum number of characters for two words
    456 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
    457 
    458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    459     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    460       fDictionary(adoptDictionary)
    461 {
    462     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    463     if (U_SUCCESS(status)) {
    464         setCharacters(fLaoWordSet);
    465     }
    466     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    467     fMarkSet.add(0x0020);
    468     fEndWordSet = fLaoWordSet;
    469     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    470     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    471     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    472     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
    473 
    474     // Compact for caching.
    475     fMarkSet.compact();
    476     fEndWordSet.compact();
    477     fBeginWordSet.compact();
    478 }
    479 
    480 LaoBreakEngine::~LaoBreakEngine() {
    481     delete fDictionary;
    482 }
    483 
    484 int32_t
    485 LaoBreakEngine::divideUpDictionaryRange( UText *text,
    486                                                 int32_t rangeStart,
    487                                                 int32_t rangeEnd,
    488                                                 UStack &foundBreaks ) const {
    489     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
    490         return 0;       // Not enough characters for two words
    491     }
    492 
    493     uint32_t wordsFound = 0;
    494     int32_t cpWordLength = 0;
    495     int32_t cuWordLength = 0;
    496     int32_t current;
    497     UErrorCode status = U_ZERO_ERROR;
    498     PossibleWord words[LAO_LOOKAHEAD];
    499 
    500     utext_setNativeIndex(text, rangeStart);
    501 
    502     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    503         cuWordLength = 0;
    504         cpWordLength = 0;
    505 
    506         // Look for candidate words at the current position
    507         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    508 
    509         // If we found exactly one, use that
    510         if (candidates == 1) {
    511             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    512             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
    513             wordsFound += 1;
    514         }
    515         // If there was more than one, see which one can take us forward the most words
    516         else if (candidates > 1) {
    517             // If we're already at the end of the range, we're done
    518             if (utext_getNativeIndex(text) >= rangeEnd) {
    519                 goto foundBest;
    520             }
    521             do {
    522                 int32_t wordsMatched = 1;
    523                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    524                     if (wordsMatched < 2) {
    525                         // Followed by another dictionary word; mark first word as a good candidate
    526                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
    527                         wordsMatched = 2;
    528                     }
    529 
    530                     // If we're already at the end of the range, we're done
    531                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    532                         goto foundBest;
    533                     }
    534 
    535                     // See if any of the possible second words is followed by a third word
    536                     do {
    537                         // If we find a third word, stop right away
    538                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    539                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
    540                             goto foundBest;
    541                         }
    542                     }
    543                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
    544                 }
    545             }
    546             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
    547 foundBest:
    548             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    549             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
    550             wordsFound += 1;
    551         }
    552 
    553         // We come here after having either found a word or not. We look ahead to the
    554         // next word. If it's not a dictionary word, we will combine it withe the word we
    555         // just found (if there is one), but only if the preceding word does not exceed
    556         // the threshold.
    557         // The text iterator should now be positioned at the end of the word we found.
    558         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
    559             // if it is a dictionary word, do nothing. If it isn't, then if there is
    560             // no preceding word, or the non-word shares less than the minimum threshold
    561             // of characters with a dictionary word, then scan to resynchronize
    562             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    563                   && (cuWordLength == 0
    564                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
    565                 // Look for a plausible word boundary
    566                 int32_t remaining = rangeEnd - (current + cuWordLength);
    567                 UChar32 pc;
    568                 UChar32 uc;
    569                 int32_t chars = 0;
    570                 for (;;) {
    571                     int32_t pcIndex = utext_getNativeIndex(text);
    572                     pc = utext_next32(text);
    573                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
    574                     chars += pcSize;
    575                     remaining -= pcSize;
    576                     if (remaining <= 0) {
    577                         break;
    578                     }
    579                     uc = utext_current32(text);
    580                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    581                         // Maybe. See if it's in the dictionary.
    582                         // TODO: this looks iffy; compare with old code.
    583                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    584                         utext_setNativeIndex(text, current + cuWordLength + chars);
    585                         if (candidates > 0) {
    586                             break;
    587                         }
    588                     }
    589                 }
    590 
    591                 // Bump the word count if there wasn't already one
    592                 if (cuWordLength <= 0) {
    593                     wordsFound += 1;
    594                 }
    595 
    596                 // Update the length with the passed-over characters
    597                 cuWordLength += chars;
    598             }
    599             else {
    600                 // Back up to where we were for next iteration
    601                 utext_setNativeIndex(text, current + cuWordLength);
    602             }
    603         }
    604 
    605         // Never stop before a combining mark.
    606         int32_t currPos;
    607         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    608             utext_next32(text);
    609             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    610         }
    611 
    612         // Look ahead for possible suffixes if a dictionary word does not follow.
    613         // We do this in code rather than using a rule so that the heuristic
    614         // resynch continues to function. For example, one of the suffix characters
    615         // could be a typo in the middle of a word.
    616         // NOT CURRENTLY APPLICABLE TO LAO
    617 
    618         // Did we find a word on this iteration? If so, push it on the break stack
    619         if (cuWordLength > 0) {
    620             foundBreaks.push((current+cuWordLength), status);
    621         }
    622     }
    623 
    624     // Don't return a break for the end of the dictionary range if there is one there.
    625     if (foundBreaks.peeki() >= rangeEnd) {
    626         (void) foundBreaks.popi();
    627         wordsFound -= 1;
    628     }
    629 
    630     return wordsFound;
    631 }
    632 
    633 /*
    634  ******************************************************************
    635  * BurmeseBreakEngine
    636  */
    637 
    638 // How many words in a row are "good enough"?
    639 static const int32_t BURMESE_LOOKAHEAD = 3;
    640 
    641 // Will not combine a non-word with a preceding dictionary word longer than this
    642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
    643 
    644 // Will not combine a non-word that shares at least this much prefix with a
    645 // dictionary word, with a preceding word
    646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
    647 
    648 // Minimum word size
    649 static const int32_t BURMESE_MIN_WORD = 2;
    650 
    651 // Minimum number of characters for two words
    652 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
    653 
    654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    655     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    656       fDictionary(adoptDictionary)
    657 {
    658     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    659     if (U_SUCCESS(status)) {
    660         setCharacters(fBurmeseWordSet);
    661     }
    662     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    663     fMarkSet.add(0x0020);
    664     fEndWordSet = fBurmeseWordSet;
    665     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
    666 
    667     // Compact for caching.
    668     fMarkSet.compact();
    669     fEndWordSet.compact();
    670     fBeginWordSet.compact();
    671 }
    672 
    673 BurmeseBreakEngine::~BurmeseBreakEngine() {
    674     delete fDictionary;
    675 }
    676 
    677 int32_t
    678 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
    679                                                 int32_t rangeStart,
    680                                                 int32_t rangeEnd,
    681                                                 UStack &foundBreaks ) const {
    682     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
    683         return 0;       // Not enough characters for two words
    684     }
    685 
    686     uint32_t wordsFound = 0;
    687     int32_t cpWordLength = 0;
    688     int32_t cuWordLength = 0;
    689     int32_t current;
    690     UErrorCode status = U_ZERO_ERROR;
    691     PossibleWord words[BURMESE_LOOKAHEAD];
    692 
    693     utext_setNativeIndex(text, rangeStart);
    694 
    695     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    696         cuWordLength = 0;
    697         cpWordLength = 0;
    698 
    699         // Look for candidate words at the current position
    700         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    701 
    702         // If we found exactly one, use that
    703         if (candidates == 1) {
    704             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
    705             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
    706             wordsFound += 1;
    707         }
    708         // If there was more than one, see which one can take us forward the most words
    709         else if (candidates > 1) {
    710             // If we're already at the end of the range, we're done
    711             if (utext_getNativeIndex(text) >= rangeEnd) {
    712                 goto foundBest;
    713             }
    714             do {
    715                 int32_t wordsMatched = 1;
    716                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    717                     if (wordsMatched < 2) {
    718                         // Followed by another dictionary word; mark first word as a good candidate
    719                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
    720                         wordsMatched = 2;
    721                     }
    722 
    723                     // If we're already at the end of the range, we're done
    724                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    725                         goto foundBest;
    726                     }
    727 
    728                     // See if any of the possible second words is followed by a third word
    729                     do {
    730                         // If we find a third word, stop right away
    731                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    732                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
    733                             goto foundBest;
    734                         }
    735                     }
    736                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
    737                 }
    738             }
    739             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
    740 foundBest:
    741             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
    742             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
    743             wordsFound += 1;
    744         }
    745 
    746         // We come here after having either found a word or not. We look ahead to the
    747         // next word. If it's not a dictionary word, we will combine it withe the word we
    748         // just found (if there is one), but only if the preceding word does not exceed
    749         // the threshold.
    750         // The text iterator should now be positioned at the end of the word we found.
    751         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
    752             // if it is a dictionary word, do nothing. If it isn't, then if there is
    753             // no preceding word, or the non-word shares less than the minimum threshold
    754             // of characters with a dictionary word, then scan to resynchronize
    755             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    756                   && (cuWordLength == 0
    757                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
    758                 // Look for a plausible word boundary
    759                 int32_t remaining = rangeEnd - (current + cuWordLength);
    760                 UChar32 pc;
    761                 UChar32 uc;
    762                 int32_t chars = 0;
    763                 for (;;) {
    764                     int32_t pcIndex = utext_getNativeIndex(text);
    765                     pc = utext_next32(text);
    766                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
    767                     chars += pcSize;
    768                     remaining -= pcSize;
    769                     if (remaining <= 0) {
    770                         break;
    771                     }
    772                     uc = utext_current32(text);
    773                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    774                         // Maybe. See if it's in the dictionary.
    775                         // TODO: this looks iffy; compare with old code.
    776                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    777                         utext_setNativeIndex(text, current + cuWordLength + chars);
    778                         if (candidates > 0) {
    779                             break;
    780                         }
    781                     }
    782                 }
    783 
    784                 // Bump the word count if there wasn't already one
    785                 if (cuWordLength <= 0) {
    786                     wordsFound += 1;
    787                 }
    788 
    789                 // Update the length with the passed-over characters
    790                 cuWordLength += chars;
    791             }
    792             else {
    793                 // Back up to where we were for next iteration
    794                 utext_setNativeIndex(text, current + cuWordLength);
    795             }
    796         }
    797 
    798         // Never stop before a combining mark.
    799         int32_t currPos;
    800         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    801             utext_next32(text);
    802             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    803         }
    804 
    805         // Look ahead for possible suffixes if a dictionary word does not follow.
    806         // We do this in code rather than using a rule so that the heuristic
    807         // resynch continues to function. For example, one of the suffix characters
    808         // could be a typo in the middle of a word.
    809         // NOT CURRENTLY APPLICABLE TO BURMESE
    810 
    811         // Did we find a word on this iteration? If so, push it on the break stack
    812         if (cuWordLength > 0) {
    813             foundBreaks.push((current+cuWordLength), status);
    814         }
    815     }
    816 
    817     // Don't return a break for the end of the dictionary range if there is one there.
    818     if (foundBreaks.peeki() >= rangeEnd) {
    819         (void) foundBreaks.popi();
    820         wordsFound -= 1;
    821     }
    822 
    823     return wordsFound;
    824 }
    825 
    826 /*
    827  ******************************************************************
    828  * KhmerBreakEngine
    829  */
    830 
    831 // How many words in a row are "good enough"?
    832 static const int32_t KHMER_LOOKAHEAD = 3;
    833 
    834 // Will not combine a non-word with a preceding dictionary word longer than this
    835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
    836 
    837 // Will not combine a non-word that shares at least this much prefix with a
    838 // dictionary word, with a preceding word
    839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
    840 
    841 // Minimum word size
    842 static const int32_t KHMER_MIN_WORD = 2;
    843 
    844 // Minimum number of characters for two words
    845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
    846 
    847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    848     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
    849       fDictionary(adoptDictionary)
    850 {
    851     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    852     if (U_SUCCESS(status)) {
    853         setCharacters(fKhmerWordSet);
    854     }
    855     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    856     fMarkSet.add(0x0020);
    857     fEndWordSet = fKhmerWordSet;
    858     fBeginWordSet.add(0x1780, 0x17B3);
    859     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    860     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    861     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    862     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    863     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    864 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    865 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    866 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    867 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    868 //    fSuffixSet.add(THAI_PAIYANNOI);
    869 //    fSuffixSet.add(THAI_MAIYAMOK);
    870 
    871     // Compact for caching.
    872     fMarkSet.compact();
    873     fEndWordSet.compact();
    874     fBeginWordSet.compact();
    875 //    fSuffixSet.compact();
    876 }
    877 
    878 KhmerBreakEngine::~KhmerBreakEngine() {
    879     delete fDictionary;
    880 }
    881 
    882 int32_t
    883 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
    884                                                 int32_t rangeStart,
    885                                                 int32_t rangeEnd,
    886                                                 UStack &foundBreaks ) const {
    887     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
    888         return 0;       // Not enough characters for two words
    889     }
    890 
    891     uint32_t wordsFound = 0;
    892     int32_t cpWordLength = 0;
    893     int32_t cuWordLength = 0;
    894     int32_t current;
    895     UErrorCode status = U_ZERO_ERROR;
    896     PossibleWord words[KHMER_LOOKAHEAD];
    897 
    898     utext_setNativeIndex(text, rangeStart);
    899 
    900     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    901         cuWordLength = 0;
    902         cpWordLength = 0;
    903 
    904         // Look for candidate words at the current position
    905         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    906 
    907         // If we found exactly one, use that
    908         if (candidates == 1) {
    909             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    910             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
    911             wordsFound += 1;
    912         }
    913 
    914         // If there was more than one, see which one can take us forward the most words
    915         else if (candidates > 1) {
    916             // If we're already at the end of the range, we're done
    917             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    918                 goto foundBest;
    919             }
    920             do {
    921                 int32_t wordsMatched = 1;
    922                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    923                     if (wordsMatched < 2) {
    924                         // Followed by another dictionary word; mark first word as a good candidate
    925                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    926                         wordsMatched = 2;
    927                     }
    928 
    929                     // If we're already at the end of the range, we're done
    930                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    931                         goto foundBest;
    932                     }
    933 
    934                     // See if any of the possible second words is followed by a third word
    935                     do {
    936                         // If we find a third word, stop right away
    937                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    938                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    939                             goto foundBest;
    940                         }
    941                     }
    942                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
    943                 }
    944             }
    945             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
    946 foundBest:
    947             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    948             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
    949             wordsFound += 1;
    950         }
    951 
    952         // We come here after having either found a word or not. We look ahead to the
    953         // next word. If it's not a dictionary word, we will combine it with the word we
    954         // just found (if there is one), but only if the preceding word does not exceed
    955         // the threshold.
    956         // The text iterator should now be positioned at the end of the word we found.
    957         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
    958             // if it is a dictionary word, do nothing. If it isn't, then if there is
    959             // no preceding word, or the non-word shares less than the minimum threshold
    960             // of characters with a dictionary word, then scan to resynchronize
    961             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    962                   && (cuWordLength == 0
    963                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
    964                 // Look for a plausible word boundary
    965                 int32_t remaining = rangeEnd - (current+cuWordLength);
    966                 UChar32 pc;
    967                 UChar32 uc;
    968                 int32_t chars = 0;
    969                 for (;;) {
    970                     int32_t pcIndex = utext_getNativeIndex(text);
    971                     pc = utext_next32(text);
    972                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
    973                     chars += pcSize;
    974                     remaining -= pcSize;
    975                     if (remaining <= 0) {
    976                         break;
    977                     }
    978                     uc = utext_current32(text);
    979                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    980                         // Maybe. See if it's in the dictionary.
    981                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    982                         utext_setNativeIndex(text, current+cuWordLength+chars);
    983                         if (candidates > 0) {
    984                             break;
    985                         }
    986                     }
    987                 }
    988 
    989                 // Bump the word count if there wasn't already one
    990                 if (cuWordLength <= 0) {
    991                     wordsFound += 1;
    992                 }
    993 
    994                 // Update the length with the passed-over characters
    995                 cuWordLength += chars;
    996             }
    997             else {
    998                 // Back up to where we were for next iteration
    999                 utext_setNativeIndex(text, current+cuWordLength);
   1000             }
   1001         }
   1002 
   1003         // Never stop before a combining mark.
   1004         int32_t currPos;
   1005         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
   1006             utext_next32(text);
   1007             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
   1008         }
   1009 
   1010         // Look ahead for possible suffixes if a dictionary word does not follow.
   1011         // We do this in code rather than using a rule so that the heuristic
   1012         // resynch continues to function. For example, one of the suffix characters
   1013         // could be a typo in the middle of a word.
   1014 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
   1015 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
   1016 //                && fSuffixSet.contains(uc = utext_current32(text))) {
   1017 //                if (uc == KHMER_PAIYANNOI) {
   1018 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
   1019 //                        // Skip over previous end and PAIYANNOI
   1020 //                        utext_next32(text);
   1021 //                        utext_next32(text);
   1022 //                        wordLength += 1;            // Add PAIYANNOI to word
   1023 //                        uc = utext_current32(text);     // Fetch next character
   1024 //                    }
   1025 //                    else {
   1026 //                        // Restore prior position
   1027 //                        utext_next32(text);
   1028 //                    }
   1029 //                }
   1030 //                if (uc == KHMER_MAIYAMOK) {
   1031 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
   1032 //                        // Skip over previous end and MAIYAMOK
   1033 //                        utext_next32(text);
   1034 //                        utext_next32(text);
   1035 //                        wordLength += 1;            // Add MAIYAMOK to word
   1036 //                    }
   1037 //                    else {
   1038 //                        // Restore prior position
   1039 //                        utext_next32(text);
   1040 //                    }
   1041 //                }
   1042 //            }
   1043 //            else {
   1044 //                utext_setNativeIndex(text, current+wordLength);
   1045 //            }
   1046 //        }
   1047 
   1048         // Did we find a word on this iteration? If so, push it on the break stack
   1049         if (cuWordLength > 0) {
   1050             foundBreaks.push((current+cuWordLength), status);
   1051         }
   1052     }
   1053 
   1054     // Don't return a break for the end of the dictionary range if there is one there.
   1055     if (foundBreaks.peeki() >= rangeEnd) {
   1056         (void) foundBreaks.popi();
   1057         wordsFound -= 1;
   1058     }
   1059 
   1060     return wordsFound;
   1061 }
   1062 
   1063 #if !UCONFIG_NO_NORMALIZATION
   1064 /*
   1065  ******************************************************************
   1066  * CjkBreakEngine
   1067  */
   1068 static const uint32_t kuint32max = 0xFFFFFFFF;
   1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
   1070 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
   1071     // Korean dictionary only includes Hangul syllables
   1072     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
   1073     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
   1074     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
   1075     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
   1076     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
   1077 
   1078     if (U_SUCCESS(status)) {
   1079         // handle Korean and Japanese/Chinese using different dictionaries
   1080         if (type == kKorean) {
   1081             setCharacters(fHangulWordSet);
   1082         } else { //Chinese and Japanese
   1083             UnicodeSet cjSet;
   1084             cjSet.addAll(fHanWordSet);
   1085             cjSet.addAll(fKatakanaWordSet);
   1086             cjSet.addAll(fHiraganaWordSet);
   1087             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
   1088             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
   1089             setCharacters(cjSet);
   1090         }
   1091     }
   1092 }
   1093 
   1094 CjkBreakEngine::~CjkBreakEngine(){
   1095     delete fDictionary;
   1096 }
   1097 
   1098 // The katakanaCost values below are based on the length frequencies of all
   1099 // katakana phrases in the dictionary
   1100 static const int32_t kMaxKatakanaLength = 8;
   1101 static const int32_t kMaxKatakanaGroupLength = 20;
   1102 static const uint32_t maxSnlp = 255;
   1103 
   1104 static inline uint32_t getKatakanaCost(int32_t wordLength){
   1105     //TODO: fill array with actual values from dictionary!
   1106     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
   1107                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
   1108     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
   1109 }
   1110 
   1111 static inline bool isKatakana(uint16_t value) {
   1112     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
   1113             (value >= 0xFF66u && value <= 0xFF9fu);
   1114 }
   1115 
   1116 
   1117 // Function for accessing internal utext flags.
   1118 //   Replicates an internal UText function.
   1119 
   1120 static inline int32_t utext_i32_flag(int32_t bitIndex) {
   1121     return (int32_t)1 << bitIndex;
   1122 }
   1123 
   1124 
   1125 /*
   1126  * @param text A UText representing the text
   1127  * @param rangeStart The start of the range of dictionary characters
   1128  * @param rangeEnd The end of the range of dictionary characters
   1129  * @param foundBreaks Output of C array of int32_t break positions, or 0
   1130  * @return The number of breaks found
   1131  */
   1132 int32_t
   1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
   1134         int32_t rangeStart,
   1135         int32_t rangeEnd,
   1136         UStack &foundBreaks ) const {
   1137     if (rangeStart >= rangeEnd) {
   1138         return 0;
   1139     }
   1140 
   1141     // UnicodeString version of input UText, NFKC normalized if necessary.
   1142     UnicodeString inString;
   1143 
   1144     // inputMap[inStringIndex] = corresponding native index from UText inText.
   1145     // If NULL then mapping is 1:1
   1146     LocalPointer<UVector32>     inputMap;
   1147 
   1148     UErrorCode     status      = U_ZERO_ERROR;
   1149 
   1150 
   1151     // if UText has the input string as one contiguous UTF-16 chunk
   1152     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
   1153          inText->chunkNativeStart <= rangeStart &&
   1154          inText->chunkNativeLimit >= rangeEnd   &&
   1155          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
   1156 
   1157         // Input UText is in one contiguous UTF-16 chunk.
   1158         // Use Read-only aliasing UnicodeString.
   1159         inString.setTo(FALSE,
   1160                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
   1161                        rangeEnd - rangeStart);
   1162     } else {
   1163         // Copy the text from the original inText (UText) to inString (UnicodeString).
   1164         // Create a map from UnicodeString indices -> UText offsets.
   1165         utext_setNativeIndex(inText, rangeStart);
   1166         int32_t limit = rangeEnd;
   1167         U_ASSERT(limit <= utext_nativeLength(inText));
   1168         if (limit > utext_nativeLength(inText)) {
   1169             limit = utext_nativeLength(inText);
   1170         }
   1171         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
   1172         if (U_FAILURE(status)) {
   1173             return 0;
   1174         }
   1175         while (utext_getNativeIndex(inText) < limit) {
   1176             int32_t nativePosition = utext_getNativeIndex(inText);
   1177             UChar32 c = utext_next32(inText);
   1178             U_ASSERT(c != U_SENTINEL);
   1179             inString.append(c);
   1180             while (inputMap->size() < inString.length()) {
   1181                 inputMap->addElement(nativePosition, status);
   1182             }
   1183         }
   1184         inputMap->addElement(limit, status);
   1185     }
   1186 
   1187 
   1188     if (!nfkcNorm2->isNormalized(inString, status)) {
   1189         UnicodeString normalizedInput;
   1190         //  normalizedMap[normalizedInput position] ==  original UText position.
   1191         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
   1192         if (U_FAILURE(status)) {
   1193             return 0;
   1194         }
   1195 
   1196         UnicodeString fragment;
   1197         UnicodeString normalizedFragment;
   1198         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
   1199             fragment.remove();
   1200             int32_t fragmentStartI = srcI;
   1201             UChar32 c = inString.char32At(srcI);
   1202             for (;;) {
   1203                 fragment.append(c);
   1204                 srcI = inString.moveIndex32(srcI, 1);
   1205                 if (srcI == inString.length()) {
   1206                     break;
   1207                 }
   1208                 c = inString.char32At(srcI);
   1209                 if (nfkcNorm2->hasBoundaryBefore(c)) {
   1210                     break;
   1211                 }
   1212             }
   1213             nfkcNorm2->normalize(fragment, normalizedFragment, status);
   1214             normalizedInput.append(normalizedFragment);
   1215 
   1216             // Map every position in the normalized chunk to the start of the chunk
   1217             //   in the original input.
   1218             int32_t fragmentOriginalStart = inputMap.isValid() ?
   1219                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
   1220             while (normalizedMap->size() < normalizedInput.length()) {
   1221                 normalizedMap->addElement(fragmentOriginalStart, status);
   1222                 if (U_FAILURE(status)) {
   1223                     break;
   1224                 }
   1225             }
   1226         }
   1227         U_ASSERT(normalizedMap->size() == normalizedInput.length());
   1228         int32_t nativeEnd = inputMap.isValid() ?
   1229                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
   1230         normalizedMap->addElement(nativeEnd, status);
   1231 
   1232         inputMap.moveFrom(normalizedMap);
   1233         inString.moveFrom(normalizedInput);
   1234     }
   1235 
   1236     int32_t numCodePts = inString.countChar32();
   1237     if (numCodePts != inString.length()) {
   1238         // There are supplementary characters in the input.
   1239         // The dictionary will produce boundary positions in terms of code point indexes,
   1240         //   not in terms of code unit string indexes.
   1241         // Use the inputMap mechanism to take care of this in addition to indexing differences
   1242         //    from normalization and/or UTF-8 input.
   1243         UBool hadExistingMap = inputMap.isValid();
   1244         if (!hadExistingMap) {
   1245             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
   1246             if (U_FAILURE(status)) {
   1247                 return 0;
   1248             }
   1249         }
   1250         int32_t cpIdx = 0;
   1251         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
   1252             U_ASSERT(cuIdx >= cpIdx);
   1253             if (hadExistingMap) {
   1254                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
   1255             } else {
   1256                 inputMap->addElement(cuIdx+rangeStart, status);
   1257             }
   1258             cpIdx++;
   1259             if (cuIdx == inString.length()) {
   1260                break;
   1261             }
   1262         }
   1263     }
   1264 
   1265     // bestSnlp[i] is the snlp of the best segmentation of the first i
   1266     // code points in the range to be matched.
   1267     UVector32 bestSnlp(numCodePts + 1, status);
   1268     bestSnlp.addElement(0, status);
   1269     for(int32_t i = 1; i <= numCodePts; i++) {
   1270         bestSnlp.addElement(kuint32max, status);
   1271     }
   1272 
   1273 
   1274     // prev[i] is the index of the last CJK code point in the previous word in
   1275     // the best segmentation of the first i characters.
   1276     UVector32 prev(numCodePts + 1, status);
   1277     for(int32_t i = 0; i <= numCodePts; i++){
   1278         prev.addElement(-1, status);
   1279     }
   1280 
   1281     const int32_t maxWordSize = 20;
   1282     UVector32 values(numCodePts, status);
   1283     values.setSize(numCodePts);
   1284     UVector32 lengths(numCodePts, status);
   1285     lengths.setSize(numCodePts);
   1286 
   1287     UText fu = UTEXT_INITIALIZER;
   1288     utext_openUnicodeString(&fu, &inString, &status);
   1289 
   1290     // Dynamic programming to find the best segmentation.
   1291 
   1292     // In outer loop, i  is the code point index,
   1293     //                ix is the corresponding string (code unit) index.
   1294     //    They differ when the string contains supplementary characters.
   1295     int32_t ix = 0;
   1296     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
   1297         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
   1298             continue;
   1299         }
   1300 
   1301         int32_t count;
   1302         utext_setNativeIndex(&fu, ix);
   1303         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
   1304                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
   1305                              // Note: lengths is filled with code point lengths
   1306                              //       The NULL parameter is the ignored code unit lengths.
   1307 
   1308         // if there are no single character matches found in the dictionary
   1309         // starting with this charcter, treat character as a 1-character word
   1310         // with the highest value possible, i.e. the least likely to occur.
   1311         // Exclude Korean characters from this treatment, as they should be left
   1312         // together by default.
   1313         if ((count == 0 || lengths.elementAti(0) != 1) &&
   1314                 !fHangulWordSet.contains(inString.char32At(ix))) {
   1315             values.setElementAt(maxSnlp, count);   // 255
   1316             lengths.setElementAt(1, count++);
   1317         }
   1318 
   1319         for (int32_t j = 0; j < count; j++) {
   1320             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
   1321             int32_t ln_j_i = lengths.elementAti(j) + i;
   1322             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
   1323                 bestSnlp.setElementAt(newSnlp, ln_j_i);
   1324                 prev.setElementAt(i, ln_j_i);
   1325             }
   1326         }
   1327 
   1328         // In Japanese,
   1329         // Katakana word in single character is pretty rare. So we apply
   1330         // the following heuristic to Katakana: any continuous run of Katakana
   1331         // characters is considered a candidate word with a default cost
   1332         // specified in the katakanaCost table according to its length.
   1333 
   1334         bool is_prev_katakana = false;
   1335         bool is_katakana = isKatakana(inString.char32At(ix));
   1336         int32_t katakanaRunLength = 1;
   1337         if (!is_prev_katakana && is_katakana) {
   1338             int32_t j = inString.moveIndex32(ix, 1);
   1339             // Find the end of the continuous run of Katakana characters
   1340             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
   1341                     isKatakana(inString.char32At(j))) {
   1342                 j = inString.moveIndex32(j, 1);
   1343                 katakanaRunLength++;
   1344             }
   1345             if (katakanaRunLength < kMaxKatakanaGroupLength) {
   1346                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
   1347                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
   1348                     bestSnlp.setElementAt(newSnlp, j);
   1349                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
   1350                 }
   1351             }
   1352         }
   1353         is_prev_katakana = is_katakana;
   1354     }
   1355     utext_close(&fu);
   1356 
   1357     // Start pushing the optimal offset index into t_boundary (t for tentative).
   1358     // prev[numCodePts] is guaranteed to be meaningful.
   1359     // We'll first push in the reverse order, i.e.,
   1360     // t_boundary[0] = numCodePts, and afterwards do a swap.
   1361     UVector32 t_boundary(numCodePts+1, status);
   1362 
   1363     int32_t numBreaks = 0;
   1364     // No segmentation found, set boundary to end of range
   1365     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
   1366         t_boundary.addElement(numCodePts, status);
   1367         numBreaks++;
   1368     } else {
   1369         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
   1370             t_boundary.addElement(i, status);
   1371             numBreaks++;
   1372         }
   1373         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
   1374     }
   1375 
   1376     // Add a break for the start of the dictionary range if there is not one
   1377     // there already.
   1378     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
   1379         t_boundary.addElement(0, status);
   1380         numBreaks++;
   1381     }
   1382 
   1383     // Now that we're done, convert positions in t_boundary[] (indices in
   1384     // the normalized input string) back to indices in the original input UText
   1385     // while reversing t_boundary and pushing values to foundBreaks.
   1386     for (int32_t i = numBreaks-1; i >= 0; i--) {
   1387         int32_t cpPos = t_boundary.elementAti(i);
   1388         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
   1389         // Boundaries are added to foundBreaks output in ascending order.
   1390         U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
   1391         foundBreaks.push(utextPos, status);
   1392     }
   1393 
   1394     // inString goes out of scope
   1395     // inputMap goes out of scope
   1396     return numBreaks;
   1397 }
   1398 #endif
   1399 
   1400 U_NAMESPACE_END
   1401 
   1402 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1403 
   1404