Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4  *******************************************************************************
      5  * Copyright (C) 2006-2016, International Business Machines Corporation
      6  * and others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "brkeng.h"
     15 #include "dictbe.h"
     16 #include "unicode/uniset.h"
     17 #include "unicode/chariter.h"
     18 #include "unicode/ubrk.h"
     19 #include "uvectr32.h"
     20 #include "uvector.h"
     21 #include "uassert.h"
     22 #include "unicode/normlzr.h"
     23 #include "cmemory.h"
     24 #include "dictionarydata.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 /*
     29  ******************************************************************
     30  */
     31 
     32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
     33     fTypes = breakTypes;
     34 }
     35 
     36 DictionaryBreakEngine::~DictionaryBreakEngine() {
     37 }
     38 
     39 UBool
     40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
     41     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
     42             && fSet.contains(c));
     43 }
     44 
     45 int32_t
     46 DictionaryBreakEngine::findBreaks( UText *text,
     47                                  int32_t startPos,
     48                                  int32_t endPos,
     49                                  int32_t breakType,
     50                                  UVector32 &foundBreaks ) const {
     51     (void)startPos;            // TODO: remove this param?
     52     int32_t result = 0;
     53 
     54     // Find the span of characters included in the set.
     55     //   The span to break begins at the current position in the text, and
     56     //   extends towards the start or end of the text, depending on 'reverse'.
     57 
     58     int32_t start = (int32_t)utext_getNativeIndex(text);
     59     int32_t current;
     60     int32_t rangeStart;
     61     int32_t rangeEnd;
     62     UChar32 c = utext_current32(text);
     63     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
     64         utext_next32(text);         // TODO:  recast loop for postincrement
     65         c = utext_current32(text);
     66     }
     67     rangeStart = start;
     68     rangeEnd = current;
     69     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
     70         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
     71         utext_setNativeIndex(text, current);
     72     }
     73 
     74     return result;
     75 }
     76 
     77 void
     78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
     79     fSet = set;
     80     // Compact for caching
     81     fSet.compact();
     82 }
     83 
     84 /*
     85  ******************************************************************
     86  * PossibleWord
     87  */
     88 
     89 // Helper class for improving readability of the Thai/Lao/Khmer word break
     90 // algorithm. The implementation is completely inline.
     91 
     92 // List size, limited by the maximum number of words in the dictionary
     93 // that form a nested sequence.
     94 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
     95 
     96 class PossibleWord {
     97 private:
     98     // list of word candidate lengths, in increasing length order
     99     // TODO: bytes would be sufficient for word lengths.
    100     int32_t   count;      // Count of candidates
    101     int32_t   prefix;     // The longest match with a dictionary word
    102     int32_t   offset;     // Offset in the text of these candidates
    103     int32_t   mark;       // The preferred candidate's offset
    104     int32_t   current;    // The candidate we're currently looking at
    105     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
    106     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
    107 
    108 public:
    109     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
    110     ~PossibleWord() {};
    111 
    112     // Fill the list of candidates if needed, select the longest, and return the number found
    113     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
    114 
    115     // Select the currently marked candidate, point after it in the text, and invalidate self
    116     int32_t   acceptMarked( UText *text );
    117 
    118     // Back up from the current candidate to the next shorter one; return TRUE if that exists
    119     // and point the text after it
    120     UBool     backUp( UText *text );
    121 
    122     // Return the longest prefix this candidate location shares with a dictionary word
    123     // Return value is in code points.
    124     int32_t   longestPrefix() { return prefix; };
    125 
    126     // Mark the current candidate as the one we like
    127     void      markCurrent() { mark = current; };
    128 
    129     // Get length in code points of the marked word.
    130     int32_t   markedCPLength() { return cpLengths[mark]; };
    131 };
    132 
    133 
    134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    135     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    136     int32_t start = (int32_t)utext_getNativeIndex(text);
    137     if (start != offset) {
    138         offset = start;
    139         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
    140         // Dictionary leaves text after longest prefix, not longest word. Back up.
    141         if (count <= 0) {
    142             utext_setNativeIndex(text, start);
    143         }
    144     }
    145     if (count > 0) {
    146         utext_setNativeIndex(text, start+cuLengths[count-1]);
    147     }
    148     current = count-1;
    149     mark = current;
    150     return count;
    151 }
    152 
    153 int32_t
    154 PossibleWord::acceptMarked( UText *text ) {
    155     utext_setNativeIndex(text, offset + cuLengths[mark]);
    156     return cuLengths[mark];
    157 }
    158 
    159 
    160 UBool
    161 PossibleWord::backUp( UText *text ) {
    162     if (current > 0) {
    163         utext_setNativeIndex(text, offset + cuLengths[--current]);
    164         return TRUE;
    165     }
    166     return FALSE;
    167 }
    168 
    169 /*
    170  ******************************************************************
    171  * ThaiBreakEngine
    172  */
    173 
    174 // How many words in a row are "good enough"?
    175 static const int32_t THAI_LOOKAHEAD = 3;
    176 
    177 // Will not combine a non-word with a preceding dictionary word longer than this
    178 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
    179 
    180 // Will not combine a non-word that shares at least this much prefix with a
    181 // dictionary word, with a preceding word
    182 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
    183 
    184 // Ellision character
    185 static const int32_t THAI_PAIYANNOI = 0x0E2F;
    186 
    187 // Repeat character
    188 static const int32_t THAI_MAIYAMOK = 0x0E46;
    189 
    190 // Minimum word size
    191 static const int32_t THAI_MIN_WORD = 2;
    192 
    193 // Minimum number of characters for two words
    194 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
    195 
    196 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    197     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    198       fDictionary(adoptDictionary)
    199 {
    200     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    201     if (U_SUCCESS(status)) {
    202         setCharacters(fThaiWordSet);
    203     }
    204     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    205     fMarkSet.add(0x0020);
    206     fEndWordSet = fThaiWordSet;
    207     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    208     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    209     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    210     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    211     fSuffixSet.add(THAI_PAIYANNOI);
    212     fSuffixSet.add(THAI_MAIYAMOK);
    213 
    214     // Compact for caching.
    215     fMarkSet.compact();
    216     fEndWordSet.compact();
    217     fBeginWordSet.compact();
    218     fSuffixSet.compact();
    219 }
    220 
    221 ThaiBreakEngine::~ThaiBreakEngine() {
    222     delete fDictionary;
    223 }
    224 
    225 int32_t
    226 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
    227                                                 int32_t rangeStart,
    228                                                 int32_t rangeEnd,
    229                                                 UVector32 &foundBreaks ) const {
    230     utext_setNativeIndex(text, rangeStart);
    231     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
    232     if (utext_getNativeIndex(text) >= rangeEnd) {
    233         return 0;       // Not enough characters for two words
    234     }
    235     utext_setNativeIndex(text, rangeStart);
    236 
    237 
    238     uint32_t wordsFound = 0;
    239     int32_t cpWordLength = 0;    // Word Length in Code Points.
    240     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
    241     int32_t current;
    242     UErrorCode status = U_ZERO_ERROR;
    243     PossibleWord words[THAI_LOOKAHEAD];
    244 
    245     utext_setNativeIndex(text, rangeStart);
    246 
    247     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    248         cpWordLength = 0;
    249         cuWordLength = 0;
    250 
    251         // Look for candidate words at the current position
    252         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    253 
    254         // If we found exactly one, use that
    255         if (candidates == 1) {
    256             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    257             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
    258             wordsFound += 1;
    259         }
    260         // If there was more than one, see which one can take us forward the most words
    261         else if (candidates > 1) {
    262             // If we're already at the end of the range, we're done
    263             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    264                 goto foundBest;
    265             }
    266             do {
    267                 int32_t wordsMatched = 1;
    268                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    269                     if (wordsMatched < 2) {
    270                         // Followed by another dictionary word; mark first word as a good candidate
    271                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    272                         wordsMatched = 2;
    273                     }
    274 
    275                     // If we're already at the end of the range, we're done
    276                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    277                         goto foundBest;
    278                     }
    279 
    280                     // See if any of the possible second words is followed by a third word
    281                     do {
    282                         // If we find a third word, stop right away
    283                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    284                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
    285                             goto foundBest;
    286                         }
    287                     }
    288                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
    289                 }
    290             }
    291             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
    292 foundBest:
    293             // Set UText position to after the accepted word.
    294             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    295             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
    296             wordsFound += 1;
    297         }
    298 
    299         // We come here after having either found a word or not. We look ahead to the
    300         // next word. If it's not a dictionary word, we will combine it with the word we
    301         // just found (if there is one), but only if the preceding word does not exceed
    302         // the threshold.
    303         // The text iterator should now be positioned at the end of the word we found.
    304 
    305         UChar32 uc = 0;
    306         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
    307             // if it is a dictionary word, do nothing. If it isn't, then if there is
    308             // no preceding word, or the non-word shares less than the minimum threshold
    309             // of characters with a dictionary word, then scan to resynchronize
    310             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    311                   && (cuWordLength == 0
    312                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
    313                 // Look for a plausible word boundary
    314                 int32_t remaining = rangeEnd - (current+cuWordLength);
    315                 UChar32 pc;
    316                 int32_t chars = 0;
    317                 for (;;) {
    318                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
    319                     pc = utext_next32(text);
    320                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
    321                     chars += pcSize;
    322                     remaining -= pcSize;
    323                     if (remaining <= 0) {
    324                         break;
    325                     }
    326                     uc = utext_current32(text);
    327                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    328                         // Maybe. See if it's in the dictionary.
    329                         // NOTE: In the original Apple code, checked that the next
    330                         // two characters after uc were not 0x0E4C THANTHAKHAT before
    331                         // checking the dictionary. That is just a performance filter,
    332                         // but it's not clear it's faster than checking the trie.
    333                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    334                         utext_setNativeIndex(text, current + cuWordLength + chars);
    335                         if (candidates > 0) {
    336                             break;
    337                         }
    338                     }
    339                 }
    340 
    341                 // Bump the word count if there wasn't already one
    342                 if (cuWordLength <= 0) {
    343                     wordsFound += 1;
    344                 }
    345 
    346                 // Update the length with the passed-over characters
    347                 cuWordLength += chars;
    348             }
    349             else {
    350                 // Back up to where we were for next iteration
    351                 utext_setNativeIndex(text, current+cuWordLength);
    352             }
    353         }
    354 
    355         // Never stop before a combining mark.
    356         int32_t currPos;
    357         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    358             utext_next32(text);
    359             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    360         }
    361 
    362         // Look ahead for possible suffixes if a dictionary word does not follow.
    363         // We do this in code rather than using a rule so that the heuristic
    364         // resynch continues to function. For example, one of the suffix characters
    365         // could be a typo in the middle of a word.
    366         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
    367             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    368                 && fSuffixSet.contains(uc = utext_current32(text))) {
    369                 if (uc == THAI_PAIYANNOI) {
    370                     if (!fSuffixSet.contains(utext_previous32(text))) {
    371                         // Skip over previous end and PAIYANNOI
    372                         utext_next32(text);
    373                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
    374                         utext_next32(text);
    375                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
    376                         uc = utext_current32(text);     // Fetch next character
    377                     }
    378                     else {
    379                         // Restore prior position
    380                         utext_next32(text);
    381                     }
    382                 }
    383                 if (uc == THAI_MAIYAMOK) {
    384                     if (utext_previous32(text) != THAI_MAIYAMOK) {
    385                         // Skip over previous end and MAIYAMOK
    386                         utext_next32(text);
    387                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
    388                         utext_next32(text);
    389                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
    390                     }
    391                     else {
    392                         // Restore prior position
    393                         utext_next32(text);
    394                     }
    395                 }
    396             }
    397             else {
    398                 utext_setNativeIndex(text, current+cuWordLength);
    399             }
    400         }
    401 
    402         // Did we find a word on this iteration? If so, push it on the break stack
    403         if (cuWordLength > 0) {
    404             foundBreaks.push((current+cuWordLength), status);
    405         }
    406     }
    407 
    408     // Don't return a break for the end of the dictionary range if there is one there.
    409     if (foundBreaks.peeki() >= rangeEnd) {
    410         (void) foundBreaks.popi();
    411         wordsFound -= 1;
    412     }
    413 
    414     return wordsFound;
    415 }
    416 
    417 /*
    418  ******************************************************************
    419  * LaoBreakEngine
    420  */
    421 
    422 // How many words in a row are "good enough"?
    423 static const int32_t LAO_LOOKAHEAD = 3;
    424 
    425 // Will not combine a non-word with a preceding dictionary word longer than this
    426 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
    427 
    428 // Will not combine a non-word that shares at least this much prefix with a
    429 // dictionary word, with a preceding word
    430 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
    431 
    432 // Minimum word size
    433 static const int32_t LAO_MIN_WORD = 2;
    434 
    435 // Minimum number of characters for two words
    436 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
    437 
    438 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    439     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    440       fDictionary(adoptDictionary)
    441 {
    442     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    443     if (U_SUCCESS(status)) {
    444         setCharacters(fLaoWordSet);
    445     }
    446     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    447     fMarkSet.add(0x0020);
    448     fEndWordSet = fLaoWordSet;
    449     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    450     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    451     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    452     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
    453 
    454     // Compact for caching.
    455     fMarkSet.compact();
    456     fEndWordSet.compact();
    457     fBeginWordSet.compact();
    458 }
    459 
    460 LaoBreakEngine::~LaoBreakEngine() {
    461     delete fDictionary;
    462 }
    463 
    464 int32_t
    465 LaoBreakEngine::divideUpDictionaryRange( UText *text,
    466                                                 int32_t rangeStart,
    467                                                 int32_t rangeEnd,
    468                                                 UVector32 &foundBreaks ) const {
    469     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
    470         return 0;       // Not enough characters for two words
    471     }
    472 
    473     uint32_t wordsFound = 0;
    474     int32_t cpWordLength = 0;
    475     int32_t cuWordLength = 0;
    476     int32_t current;
    477     UErrorCode status = U_ZERO_ERROR;
    478     PossibleWord words[LAO_LOOKAHEAD];
    479 
    480     utext_setNativeIndex(text, rangeStart);
    481 
    482     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    483         cuWordLength = 0;
    484         cpWordLength = 0;
    485 
    486         // Look for candidate words at the current position
    487         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    488 
    489         // If we found exactly one, use that
    490         if (candidates == 1) {
    491             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    492             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
    493             wordsFound += 1;
    494         }
    495         // If there was more than one, see which one can take us forward the most words
    496         else if (candidates > 1) {
    497             // If we're already at the end of the range, we're done
    498             if (utext_getNativeIndex(text) >= rangeEnd) {
    499                 goto foundBest;
    500             }
    501             do {
    502                 int32_t wordsMatched = 1;
    503                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    504                     if (wordsMatched < 2) {
    505                         // Followed by another dictionary word; mark first word as a good candidate
    506                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
    507                         wordsMatched = 2;
    508                     }
    509 
    510                     // If we're already at the end of the range, we're done
    511                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    512                         goto foundBest;
    513                     }
    514 
    515                     // See if any of the possible second words is followed by a third word
    516                     do {
    517                         // If we find a third word, stop right away
    518                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    519                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
    520                             goto foundBest;
    521                         }
    522                     }
    523                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
    524                 }
    525             }
    526             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
    527 foundBest:
    528             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    529             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
    530             wordsFound += 1;
    531         }
    532 
    533         // We come here after having either found a word or not. We look ahead to the
    534         // next word. If it's not a dictionary word, we will combine it withe the word we
    535         // just found (if there is one), but only if the preceding word does not exceed
    536         // the threshold.
    537         // The text iterator should now be positioned at the end of the word we found.
    538         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
    539             // if it is a dictionary word, do nothing. If it isn't, then if there is
    540             // no preceding word, or the non-word shares less than the minimum threshold
    541             // of characters with a dictionary word, then scan to resynchronize
    542             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    543                   && (cuWordLength == 0
    544                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
    545                 // Look for a plausible word boundary
    546                 int32_t remaining = rangeEnd - (current + cuWordLength);
    547                 UChar32 pc;
    548                 UChar32 uc;
    549                 int32_t chars = 0;
    550                 for (;;) {
    551                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
    552                     pc = utext_next32(text);
    553                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
    554                     chars += pcSize;
    555                     remaining -= pcSize;
    556                     if (remaining <= 0) {
    557                         break;
    558                     }
    559                     uc = utext_current32(text);
    560                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    561                         // Maybe. See if it's in the dictionary.
    562                         // TODO: this looks iffy; compare with old code.
    563                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    564                         utext_setNativeIndex(text, current + cuWordLength + chars);
    565                         if (candidates > 0) {
    566                             break;
    567                         }
    568                     }
    569                 }
    570 
    571                 // Bump the word count if there wasn't already one
    572                 if (cuWordLength <= 0) {
    573                     wordsFound += 1;
    574                 }
    575 
    576                 // Update the length with the passed-over characters
    577                 cuWordLength += chars;
    578             }
    579             else {
    580                 // Back up to where we were for next iteration
    581                 utext_setNativeIndex(text, current + cuWordLength);
    582             }
    583         }
    584 
    585         // Never stop before a combining mark.
    586         int32_t currPos;
    587         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    588             utext_next32(text);
    589             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    590         }
    591 
    592         // Look ahead for possible suffixes if a dictionary word does not follow.
    593         // We do this in code rather than using a rule so that the heuristic
    594         // resynch continues to function. For example, one of the suffix characters
    595         // could be a typo in the middle of a word.
    596         // NOT CURRENTLY APPLICABLE TO LAO
    597 
    598         // Did we find a word on this iteration? If so, push it on the break stack
    599         if (cuWordLength > 0) {
    600             foundBreaks.push((current+cuWordLength), status);
    601         }
    602     }
    603 
    604     // Don't return a break for the end of the dictionary range if there is one there.
    605     if (foundBreaks.peeki() >= rangeEnd) {
    606         (void) foundBreaks.popi();
    607         wordsFound -= 1;
    608     }
    609 
    610     return wordsFound;
    611 }
    612 
    613 /*
    614  ******************************************************************
    615  * BurmeseBreakEngine
    616  */
    617 
    618 // How many words in a row are "good enough"?
    619 static const int32_t BURMESE_LOOKAHEAD = 3;
    620 
    621 // Will not combine a non-word with a preceding dictionary word longer than this
    622 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
    623 
    624 // Will not combine a non-word that shares at least this much prefix with a
    625 // dictionary word, with a preceding word
    626 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
    627 
    628 // Minimum word size
    629 static const int32_t BURMESE_MIN_WORD = 2;
    630 
    631 // Minimum number of characters for two words
    632 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
    633 
    634 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    635     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    636       fDictionary(adoptDictionary)
    637 {
    638     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    639     if (U_SUCCESS(status)) {
    640         setCharacters(fBurmeseWordSet);
    641     }
    642     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    643     fMarkSet.add(0x0020);
    644     fEndWordSet = fBurmeseWordSet;
    645     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
    646 
    647     // Compact for caching.
    648     fMarkSet.compact();
    649     fEndWordSet.compact();
    650     fBeginWordSet.compact();
    651 }
    652 
    653 BurmeseBreakEngine::~BurmeseBreakEngine() {
    654     delete fDictionary;
    655 }
    656 
    657 int32_t
    658 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
    659                                                 int32_t rangeStart,
    660                                                 int32_t rangeEnd,
    661                                                 UVector32 &foundBreaks ) const {
    662     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
    663         return 0;       // Not enough characters for two words
    664     }
    665 
    666     uint32_t wordsFound = 0;
    667     int32_t cpWordLength = 0;
    668     int32_t cuWordLength = 0;
    669     int32_t current;
    670     UErrorCode status = U_ZERO_ERROR;
    671     PossibleWord words[BURMESE_LOOKAHEAD];
    672 
    673     utext_setNativeIndex(text, rangeStart);
    674 
    675     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    676         cuWordLength = 0;
    677         cpWordLength = 0;
    678 
    679         // Look for candidate words at the current position
    680         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    681 
    682         // If we found exactly one, use that
    683         if (candidates == 1) {
    684             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
    685             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
    686             wordsFound += 1;
    687         }
    688         // If there was more than one, see which one can take us forward the most words
    689         else if (candidates > 1) {
    690             // If we're already at the end of the range, we're done
    691             if (utext_getNativeIndex(text) >= rangeEnd) {
    692                 goto foundBest;
    693             }
    694             do {
    695                 int32_t wordsMatched = 1;
    696                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    697                     if (wordsMatched < 2) {
    698                         // Followed by another dictionary word; mark first word as a good candidate
    699                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
    700                         wordsMatched = 2;
    701                     }
    702 
    703                     // If we're already at the end of the range, we're done
    704                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    705                         goto foundBest;
    706                     }
    707 
    708                     // See if any of the possible second words is followed by a third word
    709                     do {
    710                         // If we find a third word, stop right away
    711                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    712                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
    713                             goto foundBest;
    714                         }
    715                     }
    716                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
    717                 }
    718             }
    719             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
    720 foundBest:
    721             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
    722             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
    723             wordsFound += 1;
    724         }
    725 
    726         // We come here after having either found a word or not. We look ahead to the
    727         // next word. If it's not a dictionary word, we will combine it withe the word we
    728         // just found (if there is one), but only if the preceding word does not exceed
    729         // the threshold.
    730         // The text iterator should now be positioned at the end of the word we found.
    731         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
    732             // if it is a dictionary word, do nothing. If it isn't, then if there is
    733             // no preceding word, or the non-word shares less than the minimum threshold
    734             // of characters with a dictionary word, then scan to resynchronize
    735             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    736                   && (cuWordLength == 0
    737                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
    738                 // Look for a plausible word boundary
    739                 int32_t remaining = rangeEnd - (current + cuWordLength);
    740                 UChar32 pc;
    741                 UChar32 uc;
    742                 int32_t chars = 0;
    743                 for (;;) {
    744                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
    745                     pc = utext_next32(text);
    746                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
    747                     chars += pcSize;
    748                     remaining -= pcSize;
    749                     if (remaining <= 0) {
    750                         break;
    751                     }
    752                     uc = utext_current32(text);
    753                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    754                         // Maybe. See if it's in the dictionary.
    755                         // TODO: this looks iffy; compare with old code.
    756                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    757                         utext_setNativeIndex(text, current + cuWordLength + chars);
    758                         if (candidates > 0) {
    759                             break;
    760                         }
    761                     }
    762                 }
    763 
    764                 // Bump the word count if there wasn't already one
    765                 if (cuWordLength <= 0) {
    766                     wordsFound += 1;
    767                 }
    768 
    769                 // Update the length with the passed-over characters
    770                 cuWordLength += chars;
    771             }
    772             else {
    773                 // Back up to where we were for next iteration
    774                 utext_setNativeIndex(text, current + cuWordLength);
    775             }
    776         }
    777 
    778         // Never stop before a combining mark.
    779         int32_t currPos;
    780         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    781             utext_next32(text);
    782             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    783         }
    784 
    785         // Look ahead for possible suffixes if a dictionary word does not follow.
    786         // We do this in code rather than using a rule so that the heuristic
    787         // resynch continues to function. For example, one of the suffix characters
    788         // could be a typo in the middle of a word.
    789         // NOT CURRENTLY APPLICABLE TO BURMESE
    790 
    791         // Did we find a word on this iteration? If so, push it on the break stack
    792         if (cuWordLength > 0) {
    793             foundBreaks.push((current+cuWordLength), status);
    794         }
    795     }
    796 
    797     // Don't return a break for the end of the dictionary range if there is one there.
    798     if (foundBreaks.peeki() >= rangeEnd) {
    799         (void) foundBreaks.popi();
    800         wordsFound -= 1;
    801     }
    802 
    803     return wordsFound;
    804 }
    805 
    806 /*
    807  ******************************************************************
    808  * KhmerBreakEngine
    809  */
    810 
    811 // How many words in a row are "good enough"?
    812 static const int32_t KHMER_LOOKAHEAD = 3;
    813 
    814 // Will not combine a non-word with a preceding dictionary word longer than this
    815 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
    816 
    817 // Will not combine a non-word that shares at least this much prefix with a
    818 // dictionary word, with a preceding word
    819 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
    820 
    821 // Minimum word size
    822 static const int32_t KHMER_MIN_WORD = 2;
    823 
    824 // Minimum number of characters for two words
    825 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
    826 
    827 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    828     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
    829       fDictionary(adoptDictionary)
    830 {
    831     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    832     if (U_SUCCESS(status)) {
    833         setCharacters(fKhmerWordSet);
    834     }
    835     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    836     fMarkSet.add(0x0020);
    837     fEndWordSet = fKhmerWordSet;
    838     fBeginWordSet.add(0x1780, 0x17B3);
    839     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    840     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    841     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    842     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    843     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    844 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    845 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    846 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    847 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    848 //    fSuffixSet.add(THAI_PAIYANNOI);
    849 //    fSuffixSet.add(THAI_MAIYAMOK);
    850 
    851     // Compact for caching.
    852     fMarkSet.compact();
    853     fEndWordSet.compact();
    854     fBeginWordSet.compact();
    855 //    fSuffixSet.compact();
    856 }
    857 
    858 KhmerBreakEngine::~KhmerBreakEngine() {
    859     delete fDictionary;
    860 }
    861 
    862 int32_t
    863 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
    864                                                 int32_t rangeStart,
    865                                                 int32_t rangeEnd,
    866                                                 UVector32 &foundBreaks ) const {
    867     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
    868         return 0;       // Not enough characters for two words
    869     }
    870 
    871     uint32_t wordsFound = 0;
    872     int32_t cpWordLength = 0;
    873     int32_t cuWordLength = 0;
    874     int32_t current;
    875     UErrorCode status = U_ZERO_ERROR;
    876     PossibleWord words[KHMER_LOOKAHEAD];
    877 
    878     utext_setNativeIndex(text, rangeStart);
    879 
    880     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    881         cuWordLength = 0;
    882         cpWordLength = 0;
    883 
    884         // Look for candidate words at the current position
    885         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    886 
    887         // If we found exactly one, use that
    888         if (candidates == 1) {
    889             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    890             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
    891             wordsFound += 1;
    892         }
    893 
    894         // If there was more than one, see which one can take us forward the most words
    895         else if (candidates > 1) {
    896             // If we're already at the end of the range, we're done
    897             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    898                 goto foundBest;
    899             }
    900             do {
    901                 int32_t wordsMatched = 1;
    902                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    903                     if (wordsMatched < 2) {
    904                         // Followed by another dictionary word; mark first word as a good candidate
    905                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    906                         wordsMatched = 2;
    907                     }
    908 
    909                     // If we're already at the end of the range, we're done
    910                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    911                         goto foundBest;
    912                     }
    913 
    914                     // See if any of the possible second words is followed by a third word
    915                     do {
    916                         // If we find a third word, stop right away
    917                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    918                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    919                             goto foundBest;
    920                         }
    921                     }
    922                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
    923                 }
    924             }
    925             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
    926 foundBest:
    927             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    928             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
    929             wordsFound += 1;
    930         }
    931 
    932         // We come here after having either found a word or not. We look ahead to the
    933         // next word. If it's not a dictionary word, we will combine it with the word we
    934         // just found (if there is one), but only if the preceding word does not exceed
    935         // the threshold.
    936         // The text iterator should now be positioned at the end of the word we found.
    937         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
    938             // if it is a dictionary word, do nothing. If it isn't, then if there is
    939             // no preceding word, or the non-word shares less than the minimum threshold
    940             // of characters with a dictionary word, then scan to resynchronize
    941             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    942                   && (cuWordLength == 0
    943                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
    944                 // Look for a plausible word boundary
    945                 int32_t remaining = rangeEnd - (current+cuWordLength);
    946                 UChar32 pc;
    947                 UChar32 uc;
    948                 int32_t chars = 0;
    949                 for (;;) {
    950                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
    951                     pc = utext_next32(text);
    952                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
    953                     chars += pcSize;
    954                     remaining -= pcSize;
    955                     if (remaining <= 0) {
    956                         break;
    957                     }
    958                     uc = utext_current32(text);
    959                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    960                         // Maybe. See if it's in the dictionary.
    961                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    962                         utext_setNativeIndex(text, current+cuWordLength+chars);
    963                         if (candidates > 0) {
    964                             break;
    965                         }
    966                     }
    967                 }
    968 
    969                 // Bump the word count if there wasn't already one
    970                 if (cuWordLength <= 0) {
    971                     wordsFound += 1;
    972                 }
    973 
    974                 // Update the length with the passed-over characters
    975                 cuWordLength += chars;
    976             }
    977             else {
    978                 // Back up to where we were for next iteration
    979                 utext_setNativeIndex(text, current+cuWordLength);
    980             }
    981         }
    982 
    983         // Never stop before a combining mark.
    984         int32_t currPos;
    985         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    986             utext_next32(text);
    987             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    988         }
    989 
    990         // Look ahead for possible suffixes if a dictionary word does not follow.
    991         // We do this in code rather than using a rule so that the heuristic
    992         // resynch continues to function. For example, one of the suffix characters
    993         // could be a typo in the middle of a word.
    994 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
    995 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    996 //                && fSuffixSet.contains(uc = utext_current32(text))) {
    997 //                if (uc == KHMER_PAIYANNOI) {
    998 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
    999 //                        // Skip over previous end and PAIYANNOI
   1000 //                        utext_next32(text);
   1001 //                        utext_next32(text);
   1002 //                        wordLength += 1;            // Add PAIYANNOI to word
   1003 //                        uc = utext_current32(text);     // Fetch next character
   1004 //                    }
   1005 //                    else {
   1006 //                        // Restore prior position
   1007 //                        utext_next32(text);
   1008 //                    }
   1009 //                }
   1010 //                if (uc == KHMER_MAIYAMOK) {
   1011 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
   1012 //                        // Skip over previous end and MAIYAMOK
   1013 //                        utext_next32(text);
   1014 //                        utext_next32(text);
   1015 //                        wordLength += 1;            // Add MAIYAMOK to word
   1016 //                    }
   1017 //                    else {
   1018 //                        // Restore prior position
   1019 //                        utext_next32(text);
   1020 //                    }
   1021 //                }
   1022 //            }
   1023 //            else {
   1024 //                utext_setNativeIndex(text, current+wordLength);
   1025 //            }
   1026 //        }
   1027 
   1028         // Did we find a word on this iteration? If so, push it on the break stack
   1029         if (cuWordLength > 0) {
   1030             foundBreaks.push((current+cuWordLength), status);
   1031         }
   1032     }
   1033 
   1034     // Don't return a break for the end of the dictionary range if there is one there.
   1035     if (foundBreaks.peeki() >= rangeEnd) {
   1036         (void) foundBreaks.popi();
   1037         wordsFound -= 1;
   1038     }
   1039 
   1040     return wordsFound;
   1041 }
   1042 
   1043 #if !UCONFIG_NO_NORMALIZATION
   1044 /*
   1045  ******************************************************************
   1046  * CjkBreakEngine
   1047  */
   1048 static const uint32_t kuint32max = 0xFFFFFFFF;
   1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
   1050 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
   1051     // Korean dictionary only includes Hangul syllables
   1052     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
   1053     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
   1054     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
   1055     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
   1056     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
   1057 
   1058     if (U_SUCCESS(status)) {
   1059         // handle Korean and Japanese/Chinese using different dictionaries
   1060         if (type == kKorean) {
   1061             setCharacters(fHangulWordSet);
   1062         } else { //Chinese and Japanese
   1063             UnicodeSet cjSet;
   1064             cjSet.addAll(fHanWordSet);
   1065             cjSet.addAll(fKatakanaWordSet);
   1066             cjSet.addAll(fHiraganaWordSet);
   1067             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
   1068             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
   1069             setCharacters(cjSet);
   1070         }
   1071     }
   1072 }
   1073 
   1074 CjkBreakEngine::~CjkBreakEngine(){
   1075     delete fDictionary;
   1076 }
   1077 
   1078 // The katakanaCost values below are based on the length frequencies of all
   1079 // katakana phrases in the dictionary
   1080 static const int32_t kMaxKatakanaLength = 8;
   1081 static const int32_t kMaxKatakanaGroupLength = 20;
   1082 static const uint32_t maxSnlp = 255;
   1083 
   1084 static inline uint32_t getKatakanaCost(int32_t wordLength){
   1085     //TODO: fill array with actual values from dictionary!
   1086     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
   1087                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
   1088     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
   1089 }
   1090 
   1091 static inline bool isKatakana(UChar32 value) {
   1092     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
   1093             (value >= 0xFF66 && value <= 0xFF9f);
   1094 }
   1095 
   1096 
   1097 // Function for accessing internal utext flags.
   1098 //   Replicates an internal UText function.
   1099 
   1100 static inline int32_t utext_i32_flag(int32_t bitIndex) {
   1101     return (int32_t)1 << bitIndex;
   1102 }
   1103 
   1104 
   1105 /*
   1106  * @param text A UText representing the text
   1107  * @param rangeStart The start of the range of dictionary characters
   1108  * @param rangeEnd The end of the range of dictionary characters
   1109  * @param foundBreaks vector<int32> to receive the break positions
   1110  * @return The number of breaks found
   1111  */
   1112 int32_t
   1113 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
   1114         int32_t rangeStart,
   1115         int32_t rangeEnd,
   1116         UVector32 &foundBreaks ) const {
   1117     if (rangeStart >= rangeEnd) {
   1118         return 0;
   1119     }
   1120 
   1121     // UnicodeString version of input UText, NFKC normalized if necessary.
   1122     UnicodeString inString;
   1123 
   1124     // inputMap[inStringIndex] = corresponding native index from UText inText.
   1125     // If NULL then mapping is 1:1
   1126     LocalPointer<UVector32>     inputMap;
   1127 
   1128     UErrorCode     status      = U_ZERO_ERROR;
   1129 
   1130 
   1131     // if UText has the input string as one contiguous UTF-16 chunk
   1132     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
   1133          inText->chunkNativeStart <= rangeStart &&
   1134          inText->chunkNativeLimit >= rangeEnd   &&
   1135          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
   1136 
   1137         // Input UText is in one contiguous UTF-16 chunk.
   1138         // Use Read-only aliasing UnicodeString.
   1139         inString.setTo(FALSE,
   1140                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
   1141                        rangeEnd - rangeStart);
   1142     } else {
   1143         // Copy the text from the original inText (UText) to inString (UnicodeString).
   1144         // Create a map from UnicodeString indices -> UText offsets.
   1145         utext_setNativeIndex(inText, rangeStart);
   1146         int32_t limit = rangeEnd;
   1147         U_ASSERT(limit <= utext_nativeLength(inText));
   1148         if (limit > utext_nativeLength(inText)) {
   1149             limit = (int32_t)utext_nativeLength(inText);
   1150         }
   1151         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
   1152         if (U_FAILURE(status)) {
   1153             return 0;
   1154         }
   1155         while (utext_getNativeIndex(inText) < limit) {
   1156             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
   1157             UChar32 c = utext_next32(inText);
   1158             U_ASSERT(c != U_SENTINEL);
   1159             inString.append(c);
   1160             while (inputMap->size() < inString.length()) {
   1161                 inputMap->addElement(nativePosition, status);
   1162             }
   1163         }
   1164         inputMap->addElement(limit, status);
   1165     }
   1166 
   1167 
   1168     if (!nfkcNorm2->isNormalized(inString, status)) {
   1169         UnicodeString normalizedInput;
   1170         //  normalizedMap[normalizedInput position] ==  original UText position.
   1171         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
   1172         if (U_FAILURE(status)) {
   1173             return 0;
   1174         }
   1175 
   1176         UnicodeString fragment;
   1177         UnicodeString normalizedFragment;
   1178         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
   1179             fragment.remove();
   1180             int32_t fragmentStartI = srcI;
   1181             UChar32 c = inString.char32At(srcI);
   1182             for (;;) {
   1183                 fragment.append(c);
   1184                 srcI = inString.moveIndex32(srcI, 1);
   1185                 if (srcI == inString.length()) {
   1186                     break;
   1187                 }
   1188                 c = inString.char32At(srcI);
   1189                 if (nfkcNorm2->hasBoundaryBefore(c)) {
   1190                     break;
   1191                 }
   1192             }
   1193             nfkcNorm2->normalize(fragment, normalizedFragment, status);
   1194             normalizedInput.append(normalizedFragment);
   1195 
   1196             // Map every position in the normalized chunk to the start of the chunk
   1197             //   in the original input.
   1198             int32_t fragmentOriginalStart = inputMap.isValid() ?
   1199                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
   1200             while (normalizedMap->size() < normalizedInput.length()) {
   1201                 normalizedMap->addElement(fragmentOriginalStart, status);
   1202                 if (U_FAILURE(status)) {
   1203                     break;
   1204                 }
   1205             }
   1206         }
   1207         U_ASSERT(normalizedMap->size() == normalizedInput.length());
   1208         int32_t nativeEnd = inputMap.isValid() ?
   1209                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
   1210         normalizedMap->addElement(nativeEnd, status);
   1211 
   1212         inputMap.moveFrom(normalizedMap);
   1213         inString.moveFrom(normalizedInput);
   1214     }
   1215 
   1216     int32_t numCodePts = inString.countChar32();
   1217     if (numCodePts != inString.length()) {
   1218         // There are supplementary characters in the input.
   1219         // The dictionary will produce boundary positions in terms of code point indexes,
   1220         //   not in terms of code unit string indexes.
   1221         // Use the inputMap mechanism to take care of this in addition to indexing differences
   1222         //    from normalization and/or UTF-8 input.
   1223         UBool hadExistingMap = inputMap.isValid();
   1224         if (!hadExistingMap) {
   1225             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
   1226             if (U_FAILURE(status)) {
   1227                 return 0;
   1228             }
   1229         }
   1230         int32_t cpIdx = 0;
   1231         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
   1232             U_ASSERT(cuIdx >= cpIdx);
   1233             if (hadExistingMap) {
   1234                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
   1235             } else {
   1236                 inputMap->addElement(cuIdx+rangeStart, status);
   1237             }
   1238             cpIdx++;
   1239             if (cuIdx == inString.length()) {
   1240                break;
   1241             }
   1242         }
   1243     }
   1244 
   1245     // bestSnlp[i] is the snlp of the best segmentation of the first i
   1246     // code points in the range to be matched.
   1247     UVector32 bestSnlp(numCodePts + 1, status);
   1248     bestSnlp.addElement(0, status);
   1249     for(int32_t i = 1; i <= numCodePts; i++) {
   1250         bestSnlp.addElement(kuint32max, status);
   1251     }
   1252 
   1253 
   1254     // prev[i] is the index of the last CJK code point in the previous word in
   1255     // the best segmentation of the first i characters.
   1256     UVector32 prev(numCodePts + 1, status);
   1257     for(int32_t i = 0; i <= numCodePts; i++){
   1258         prev.addElement(-1, status);
   1259     }
   1260 
   1261     const int32_t maxWordSize = 20;
   1262     UVector32 values(numCodePts, status);
   1263     values.setSize(numCodePts);
   1264     UVector32 lengths(numCodePts, status);
   1265     lengths.setSize(numCodePts);
   1266 
   1267     UText fu = UTEXT_INITIALIZER;
   1268     utext_openUnicodeString(&fu, &inString, &status);
   1269 
   1270     // Dynamic programming to find the best segmentation.
   1271 
   1272     // In outer loop, i  is the code point index,
   1273     //                ix is the corresponding string (code unit) index.
   1274     //    They differ when the string contains supplementary characters.
   1275     int32_t ix = 0;
   1276     bool is_prev_katakana = false;
   1277     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
   1278         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
   1279             continue;
   1280         }
   1281 
   1282         int32_t count;
   1283         utext_setNativeIndex(&fu, ix);
   1284         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
   1285                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
   1286                              // Note: lengths is filled with code point lengths
   1287                              //       The NULL parameter is the ignored code unit lengths.
   1288 
   1289         // if there are no single character matches found in the dictionary
   1290         // starting with this character, treat character as a 1-character word
   1291         // with the highest value possible, i.e. the least likely to occur.
   1292         // Exclude Korean characters from this treatment, as they should be left
   1293         // together by default.
   1294         if ((count == 0 || lengths.elementAti(0) != 1) &&
   1295                 !fHangulWordSet.contains(inString.char32At(ix))) {
   1296             values.setElementAt(maxSnlp, count);   // 255
   1297             lengths.setElementAt(1, count++);
   1298         }
   1299 
   1300         for (int32_t j = 0; j < count; j++) {
   1301             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
   1302             int32_t ln_j_i = lengths.elementAti(j) + i;
   1303             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
   1304                 bestSnlp.setElementAt(newSnlp, ln_j_i);
   1305                 prev.setElementAt(i, ln_j_i);
   1306             }
   1307         }
   1308 
   1309         // In Japanese,
   1310         // Katakana word in single character is pretty rare. So we apply
   1311         // the following heuristic to Katakana: any continuous run of Katakana
   1312         // characters is considered a candidate word with a default cost
   1313         // specified in the katakanaCost table according to its length.
   1314 
   1315         bool is_katakana = isKatakana(inString.char32At(ix));
   1316         int32_t katakanaRunLength = 1;
   1317         if (!is_prev_katakana && is_katakana) {
   1318             int32_t j = inString.moveIndex32(ix, 1);
   1319             // Find the end of the continuous run of Katakana characters
   1320             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
   1321                     isKatakana(inString.char32At(j))) {
   1322                 j = inString.moveIndex32(j, 1);
   1323                 katakanaRunLength++;
   1324             }
   1325             if (katakanaRunLength < kMaxKatakanaGroupLength) {
   1326                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
   1327                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
   1328                     bestSnlp.setElementAt(newSnlp, j);
   1329                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
   1330                 }
   1331             }
   1332         }
   1333         is_prev_katakana = is_katakana;
   1334     }
   1335     utext_close(&fu);
   1336 
   1337     // Start pushing the optimal offset index into t_boundary (t for tentative).
   1338     // prev[numCodePts] is guaranteed to be meaningful.
   1339     // We'll first push in the reverse order, i.e.,
   1340     // t_boundary[0] = numCodePts, and afterwards do a swap.
   1341     UVector32 t_boundary(numCodePts+1, status);
   1342 
   1343     int32_t numBreaks = 0;
   1344     // No segmentation found, set boundary to end of range
   1345     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
   1346         t_boundary.addElement(numCodePts, status);
   1347         numBreaks++;
   1348     } else {
   1349         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
   1350             t_boundary.addElement(i, status);
   1351             numBreaks++;
   1352         }
   1353         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
   1354     }
   1355 
   1356     // Add a break for the start of the dictionary range if there is not one
   1357     // there already.
   1358     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
   1359         t_boundary.addElement(0, status);
   1360         numBreaks++;
   1361     }
   1362 
   1363     // Now that we're done, convert positions in t_boundary[] (indices in
   1364     // the normalized input string) back to indices in the original input UText
   1365     // while reversing t_boundary and pushing values to foundBreaks.
   1366     int32_t prevCPPos = -1;
   1367     int32_t prevUTextPos = -1;
   1368     for (int32_t i = numBreaks-1; i >= 0; i--) {
   1369         int32_t cpPos = t_boundary.elementAti(i);
   1370         U_ASSERT(cpPos > prevCPPos);
   1371         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
   1372         U_ASSERT(utextPos >= prevUTextPos);
   1373         if (utextPos > prevUTextPos) {
   1374             // Boundaries are added to foundBreaks output in ascending order.
   1375             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
   1376             foundBreaks.push(utextPos, status);
   1377         } else {
   1378             // Normalization expanded the input text, the dictionary found a boundary
   1379             // within the expansion, giving two boundaries with the same index in the
   1380             // original text. Ignore the second. See ticket #12918.
   1381             --numBreaks;
   1382         }
   1383         prevCPPos = cpPos;
   1384         prevUTextPos = utextPos;
   1385     }
   1386     (void)prevCPPos; // suppress compiler warnings about unused variable
   1387 
   1388     // inString goes out of scope
   1389     // inputMap goes out of scope
   1390     return numBreaks;
   1391 }
   1392 #endif
   1393 
   1394 U_NAMESPACE_END
   1395 
   1396 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1397 
   1398