Home | History | Annotate | Download | only in common
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2013, International Business Machines Corporation
      4  * and others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "brkeng.h"
     13 #include "dictbe.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/chariter.h"
     16 #include "unicode/ubrk.h"
     17 #include "uvector.h"
     18 #include "uassert.h"
     19 #include "unicode/normlzr.h"
     20 #include "cmemory.h"
     21 #include "dictionarydata.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 /*
     26  ******************************************************************
     27  */
     28 
     29 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
     30     fTypes = breakTypes;
     31 }
     32 
     33 DictionaryBreakEngine::~DictionaryBreakEngine() {
     34 }
     35 
     36 UBool
     37 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
     38     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
     39             && fSet.contains(c));
     40 }
     41 
     42 int32_t
     43 DictionaryBreakEngine::findBreaks( UText *text,
     44                                  int32_t startPos,
     45                                  int32_t endPos,
     46                                  UBool reverse,
     47                                  int32_t breakType,
     48                                  UStack &foundBreaks ) const {
     49     int32_t result = 0;
     50 
     51     // Find the span of characters included in the set.
     52     int32_t start = (int32_t)utext_getNativeIndex(text);
     53     int32_t current;
     54     int32_t rangeStart;
     55     int32_t rangeEnd;
     56     UChar32 c = utext_current32(text);
     57     if (reverse) {
     58         UBool   isDict = fSet.contains(c);
     59         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
     60             c = utext_previous32(text);
     61             isDict = fSet.contains(c);
     62         }
     63         rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
     64         rangeEnd = start + 1;
     65     }
     66     else {
     67         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
     68             utext_next32(text);         // TODO:  recast loop for postincrement
     69             c = utext_current32(text);
     70         }
     71         rangeStart = start;
     72         rangeEnd = current;
     73     }
     74     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
     75         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
     76         utext_setNativeIndex(text, current);
     77     }
     78 
     79     return result;
     80 }
     81 
     82 void
     83 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
     84     fSet = set;
     85     // Compact for caching
     86     fSet.compact();
     87 }
     88 
     89 /*
     90  ******************************************************************
     91  * PossibleWord
     92  */
     93 
     94 // Helper class for improving readability of the Thai/Lao/Khmer word break
     95 // algorithm. The implementation is completely inline.
     96 
     97 // List size, limited by the maximum number of words in the dictionary
     98 // that form a nested sequence.
     99 #define POSSIBLE_WORD_LIST_MAX 20
    100 
    101 class PossibleWord {
    102 private:
    103     // list of word candidate lengths, in increasing length order
    104     int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    105     int32_t   count;      // Count of candidates
    106     int32_t   prefix;     // The longest match with a dictionary word
    107     int32_t   offset;     // Offset in the text of these candidates
    108     int       mark;       // The preferred candidate's offset
    109     int       current;    // The candidate we're currently looking at
    110 
    111 public:
    112     PossibleWord();
    113     ~PossibleWord();
    114 
    115     // Fill the list of candidates if needed, select the longest, and return the number found
    116     int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
    117 
    118     // Select the currently marked candidate, point after it in the text, and invalidate self
    119     int32_t   acceptMarked( UText *text );
    120 
    121     // Back up from the current candidate to the next shorter one; return TRUE if that exists
    122     // and point the text after it
    123     UBool     backUp( UText *text );
    124 
    125     // Return the longest prefix this candidate location shares with a dictionary word
    126     int32_t   longestPrefix();
    127 
    128     // Mark the current candidate as the one we like
    129     void      markCurrent();
    130 };
    131 
    132 inline
    133 PossibleWord::PossibleWord() {
    134     offset = -1;
    135 }
    136 
    137 inline
    138 PossibleWord::~PossibleWord() {
    139 }
    140 
    141 inline int
    142 PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    143     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    144     int32_t start = (int32_t)utext_getNativeIndex(text);
    145     if (start != offset) {
    146         offset = start;
    147         prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
    148         // Dictionary leaves text after longest prefix, not longest word. Back up.
    149         if (count <= 0) {
    150             utext_setNativeIndex(text, start);
    151         }
    152     }
    153     if (count > 0) {
    154         utext_setNativeIndex(text, start+lengths[count-1]);
    155     }
    156     current = count-1;
    157     mark = current;
    158     return count;
    159 }
    160 
    161 inline int32_t
    162 PossibleWord::acceptMarked( UText *text ) {
    163     utext_setNativeIndex(text, offset + lengths[mark]);
    164     return lengths[mark];
    165 }
    166 
    167 inline UBool
    168 PossibleWord::backUp( UText *text ) {
    169     if (current > 0) {
    170         utext_setNativeIndex(text, offset + lengths[--current]);
    171         return TRUE;
    172     }
    173     return FALSE;
    174 }
    175 
    176 inline int32_t
    177 PossibleWord::longestPrefix() {
    178     return prefix;
    179 }
    180 
    181 inline void
    182 PossibleWord::markCurrent() {
    183     mark = current;
    184 }
    185 
    186 /*
    187  ******************************************************************
    188  * ThaiBreakEngine
    189  */
    190 
    191 // How many words in a row are "good enough"?
    192 #define THAI_LOOKAHEAD 3
    193 
    194 // Will not combine a non-word with a preceding dictionary word longer than this
    195 #define THAI_ROOT_COMBINE_THRESHOLD 3
    196 
    197 // Will not combine a non-word that shares at least this much prefix with a
    198 // dictionary word, with a preceding word
    199 #define THAI_PREFIX_COMBINE_THRESHOLD 3
    200 
    201 // Ellision character
    202 #define THAI_PAIYANNOI 0x0E2F
    203 
    204 // Repeat character
    205 #define THAI_MAIYAMOK 0x0E46
    206 
    207 // Minimum word size
    208 #define THAI_MIN_WORD 2
    209 
    210 // Minimum number of characters for two words
    211 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
    212 
    213 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    214     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    215       fDictionary(adoptDictionary)
    216 {
    217     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    218     if (U_SUCCESS(status)) {
    219         setCharacters(fThaiWordSet);
    220     }
    221     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    222     fMarkSet.add(0x0020);
    223     fEndWordSet = fThaiWordSet;
    224     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    225     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    226     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    227     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    228     fSuffixSet.add(THAI_PAIYANNOI);
    229     fSuffixSet.add(THAI_MAIYAMOK);
    230 
    231     // Compact for caching.
    232     fMarkSet.compact();
    233     fEndWordSet.compact();
    234     fBeginWordSet.compact();
    235     fSuffixSet.compact();
    236 }
    237 
    238 ThaiBreakEngine::~ThaiBreakEngine() {
    239     delete fDictionary;
    240 }
    241 
    242 int32_t
    243 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
    244                                                 int32_t rangeStart,
    245                                                 int32_t rangeEnd,
    246                                                 UStack &foundBreaks ) const {
    247     if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
    248         return 0;       // Not enough characters for two words
    249     }
    250 
    251     uint32_t wordsFound = 0;
    252     int32_t wordLength;
    253     int32_t current;
    254     UErrorCode status = U_ZERO_ERROR;
    255     PossibleWord words[THAI_LOOKAHEAD];
    256     UChar32 uc;
    257 
    258     utext_setNativeIndex(text, rangeStart);
    259 
    260     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    261         wordLength = 0;
    262 
    263         // Look for candidate words at the current position
    264         int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    265 
    266         // If we found exactly one, use that
    267         if (candidates == 1) {
    268             wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    269             wordsFound += 1;
    270         }
    271         // If there was more than one, see which one can take us forward the most words
    272         else if (candidates > 1) {
    273             // If we're already at the end of the range, we're done
    274             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    275                 goto foundBest;
    276             }
    277             do {
    278                 int wordsMatched = 1;
    279                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    280                     if (wordsMatched < 2) {
    281                         // Followed by another dictionary word; mark first word as a good candidate
    282                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    283                         wordsMatched = 2;
    284                     }
    285 
    286                     // If we're already at the end of the range, we're done
    287                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    288                         goto foundBest;
    289                     }
    290 
    291                     // See if any of the possible second words is followed by a third word
    292                     do {
    293                         // If we find a third word, stop right away
    294                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    295                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
    296                             goto foundBest;
    297                         }
    298                     }
    299                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
    300                 }
    301             }
    302             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
    303 foundBest:
    304             wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    305             wordsFound += 1;
    306         }
    307 
    308         // We come here after having either found a word or not. We look ahead to the
    309         // next word. If it's not a dictionary word, we will combine it withe the word we
    310         // just found (if there is one), but only if the preceding word does not exceed
    311         // the threshold.
    312         // The text iterator should now be positioned at the end of the word we found.
    313         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
    314             // if it is a dictionary word, do nothing. If it isn't, then if there is
    315             // no preceding word, or the non-word shares less than the minimum threshold
    316             // of characters with a dictionary word, then scan to resynchronize
    317             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    318                   && (wordLength == 0
    319                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
    320                 // Look for a plausible word boundary
    321                 //TODO: This section will need a rework for UText.
    322                 int32_t remaining = rangeEnd - (current+wordLength);
    323                 UChar32 pc = utext_current32(text);
    324                 int32_t chars = 0;
    325                 for (;;) {
    326                     utext_next32(text);
    327                     uc = utext_current32(text);
    328                     // TODO: Here we're counting on the fact that the SA languages are all
    329                     // in the BMP. This should get fixed with the UText rework.
    330                     chars += 1;
    331                     if (--remaining <= 0) {
    332                         break;
    333                     }
    334                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    335                         // Maybe. See if it's in the dictionary.
    336                         // NOTE: In the original Apple code, checked that the next
    337                         // two characters after uc were not 0x0E4C THANTHAKHAT before
    338                         // checking the dictionary. That is just a performance filter,
    339                         // but it's not clear it's faster than checking the trie.
    340                         int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    341                         utext_setNativeIndex(text, current + wordLength + chars);
    342                         if (candidates > 0) {
    343                             break;
    344                         }
    345                     }
    346                     pc = uc;
    347                 }
    348 
    349                 // Bump the word count if there wasn't already one
    350                 if (wordLength <= 0) {
    351                     wordsFound += 1;
    352                 }
    353 
    354                 // Update the length with the passed-over characters
    355                 wordLength += chars;
    356             }
    357             else {
    358                 // Back up to where we were for next iteration
    359                 utext_setNativeIndex(text, current+wordLength);
    360             }
    361         }
    362 
    363         // Never stop before a combining mark.
    364         int32_t currPos;
    365         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    366             utext_next32(text);
    367             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    368         }
    369 
    370         // Look ahead for possible suffixes if a dictionary word does not follow.
    371         // We do this in code rather than using a rule so that the heuristic
    372         // resynch continues to function. For example, one of the suffix characters
    373         // could be a typo in the middle of a word.
    374         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
    375             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    376                 && fSuffixSet.contains(uc = utext_current32(text))) {
    377                 if (uc == THAI_PAIYANNOI) {
    378                     if (!fSuffixSet.contains(utext_previous32(text))) {
    379                         // Skip over previous end and PAIYANNOI
    380                         utext_next32(text);
    381                         utext_next32(text);
    382                         wordLength += 1;            // Add PAIYANNOI to word
    383                         uc = utext_current32(text);     // Fetch next character
    384                     }
    385                     else {
    386                         // Restore prior position
    387                         utext_next32(text);
    388                     }
    389                 }
    390                 if (uc == THAI_MAIYAMOK) {
    391                     if (utext_previous32(text) != THAI_MAIYAMOK) {
    392                         // Skip over previous end and MAIYAMOK
    393                         utext_next32(text);
    394                         utext_next32(text);
    395                         wordLength += 1;            // Add MAIYAMOK to word
    396                     }
    397                     else {
    398                         // Restore prior position
    399                         utext_next32(text);
    400                     }
    401                 }
    402             }
    403             else {
    404                 utext_setNativeIndex(text, current+wordLength);
    405             }
    406         }
    407 
    408         // Did we find a word on this iteration? If so, push it on the break stack
    409         if (wordLength > 0) {
    410             foundBreaks.push((current+wordLength), status);
    411         }
    412     }
    413 
    414     // Don't return a break for the end of the dictionary range if there is one there.
    415     if (foundBreaks.peeki() >= rangeEnd) {
    416         (void) foundBreaks.popi();
    417         wordsFound -= 1;
    418     }
    419 
    420     return wordsFound;
    421 }
    422 
    423 /*
    424  ******************************************************************
    425  * LaoBreakEngine
    426  */
    427 
    428 // How many words in a row are "good enough"?
    429 #define LAO_LOOKAHEAD 3
    430 
    431 // Will not combine a non-word with a preceding dictionary word longer than this
    432 #define LAO_ROOT_COMBINE_THRESHOLD 3
    433 
    434 // Will not combine a non-word that shares at least this much prefix with a
    435 // dictionary word, with a preceding word
    436 #define LAO_PREFIX_COMBINE_THRESHOLD 3
    437 
    438 // Minimum word size
    439 #define LAO_MIN_WORD 2
    440 
    441 // Minimum number of characters for two words
    442 #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
    443 
    444 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    445     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    446       fDictionary(adoptDictionary)
    447 {
    448     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    449     if (U_SUCCESS(status)) {
    450         setCharacters(fLaoWordSet);
    451     }
    452     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    453     fMarkSet.add(0x0020);
    454     fEndWordSet = fLaoWordSet;
    455     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    456     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    457     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    458     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
    459 
    460     // Compact for caching.
    461     fMarkSet.compact();
    462     fEndWordSet.compact();
    463     fBeginWordSet.compact();
    464 }
    465 
    466 LaoBreakEngine::~LaoBreakEngine() {
    467     delete fDictionary;
    468 }
    469 
    470 int32_t
    471 LaoBreakEngine::divideUpDictionaryRange( UText *text,
    472                                                 int32_t rangeStart,
    473                                                 int32_t rangeEnd,
    474                                                 UStack &foundBreaks ) const {
    475     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
    476         return 0;       // Not enough characters for two words
    477     }
    478 
    479     uint32_t wordsFound = 0;
    480     int32_t wordLength;
    481     int32_t current;
    482     UErrorCode status = U_ZERO_ERROR;
    483     PossibleWord words[LAO_LOOKAHEAD];
    484     UChar32 uc;
    485 
    486     utext_setNativeIndex(text, rangeStart);
    487 
    488     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    489         wordLength = 0;
    490 
    491         // Look for candidate words at the current position
    492         int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    493 
    494         // If we found exactly one, use that
    495         if (candidates == 1) {
    496             wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    497             wordsFound += 1;
    498         }
    499         // If there was more than one, see which one can take us forward the most words
    500         else if (candidates > 1) {
    501             // If we're already at the end of the range, we're done
    502             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    503                 goto foundBest;
    504             }
    505             do {
    506                 int wordsMatched = 1;
    507                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    508                     if (wordsMatched < 2) {
    509                         // Followed by another dictionary word; mark first word as a good candidate
    510                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
    511                         wordsMatched = 2;
    512                     }
    513 
    514                     // If we're already at the end of the range, we're done
    515                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    516                         goto foundBest;
    517                     }
    518 
    519                     // See if any of the possible second words is followed by a third word
    520                     do {
    521                         // If we find a third word, stop right away
    522                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    523                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
    524                             goto foundBest;
    525                         }
    526                     }
    527                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
    528                 }
    529             }
    530             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
    531 foundBest:
    532             wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    533             wordsFound += 1;
    534         }
    535 
    536         // We come here after having either found a word or not. We look ahead to the
    537         // next word. If it's not a dictionary word, we will combine it withe the word we
    538         // just found (if there is one), but only if the preceding word does not exceed
    539         // the threshold.
    540         // The text iterator should now be positioned at the end of the word we found.
    541         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
    542             // if it is a dictionary word, do nothing. If it isn't, then if there is
    543             // no preceding word, or the non-word shares less than the minimum threshold
    544             // of characters with a dictionary word, then scan to resynchronize
    545             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    546                   && (wordLength == 0
    547                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
    548                 // Look for a plausible word boundary
    549                 //TODO: This section will need a rework for UText.
    550                 int32_t remaining = rangeEnd - (current+wordLength);
    551                 UChar32 pc = utext_current32(text);
    552                 int32_t chars = 0;
    553                 for (;;) {
    554                     utext_next32(text);
    555                     uc = utext_current32(text);
    556                     // TODO: Here we're counting on the fact that the SA languages are all
    557                     // in the BMP. This should get fixed with the UText rework.
    558                     chars += 1;
    559                     if (--remaining <= 0) {
    560                         break;
    561                     }
    562                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    563                         // Maybe. See if it's in the dictionary.
    564                         int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    565                         utext_setNativeIndex(text, current + wordLength + chars);
    566                         if (candidates > 0) {
    567                             break;
    568                         }
    569                     }
    570                     pc = uc;
    571                 }
    572 
    573                 // Bump the word count if there wasn't already one
    574                 if (wordLength <= 0) {
    575                     wordsFound += 1;
    576                 }
    577 
    578                 // Update the length with the passed-over characters
    579                 wordLength += chars;
    580             }
    581             else {
    582                 // Back up to where we were for next iteration
    583                 utext_setNativeIndex(text, current+wordLength);
    584             }
    585         }
    586 
    587         // Never stop before a combining mark.
    588         int32_t currPos;
    589         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    590             utext_next32(text);
    591             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    592         }
    593 
    594         // Look ahead for possible suffixes if a dictionary word does not follow.
    595         // We do this in code rather than using a rule so that the heuristic
    596         // resynch continues to function. For example, one of the suffix characters
    597         // could be a typo in the middle of a word.
    598         // NOT CURRENTLY APPLICABLE TO LAO
    599 
    600         // Did we find a word on this iteration? If so, push it on the break stack
    601         if (wordLength > 0) {
    602             foundBreaks.push((current+wordLength), status);
    603         }
    604     }
    605 
    606     // Don't return a break for the end of the dictionary range if there is one there.
    607     if (foundBreaks.peeki() >= rangeEnd) {
    608         (void) foundBreaks.popi();
    609         wordsFound -= 1;
    610     }
    611 
    612     return wordsFound;
    613 }
    614 
    615 /*
    616  ******************************************************************
    617  * KhmerBreakEngine
    618  */
    619 
    620 // How many words in a row are "good enough"?
    621 #define KHMER_LOOKAHEAD 3
    622 
    623 // Will not combine a non-word with a preceding dictionary word longer than this
    624 #define KHMER_ROOT_COMBINE_THRESHOLD 10
    625 
    626 // Will not combine a non-word that shares at least this much prefix with a
    627 // dictionary word, with a preceding word
    628 #define KHMER_PREFIX_COMBINE_THRESHOLD 5
    629 
    630 // Minimum word size
    631 #define KHMER_MIN_WORD 2
    632 
    633 // Minimum number of characters for two words
    634 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
    635 
    636 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    637     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
    638       fDictionary(adoptDictionary)
    639 {
    640     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    641     if (U_SUCCESS(status)) {
    642         setCharacters(fKhmerWordSet);
    643     }
    644     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    645     fMarkSet.add(0x0020);
    646     fEndWordSet = fKhmerWordSet;
    647     fBeginWordSet.add(0x1780, 0x17B3);
    648     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    649     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    650     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    651     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    652     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    653 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    654 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    655 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    656 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    657 //    fSuffixSet.add(THAI_PAIYANNOI);
    658 //    fSuffixSet.add(THAI_MAIYAMOK);
    659 
    660     // Compact for caching.
    661     fMarkSet.compact();
    662     fEndWordSet.compact();
    663     fBeginWordSet.compact();
    664 //    fSuffixSet.compact();
    665 }
    666 
    667 KhmerBreakEngine::~KhmerBreakEngine() {
    668     delete fDictionary;
    669 }
    670 
    671 int32_t
    672 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
    673                                                 int32_t rangeStart,
    674                                                 int32_t rangeEnd,
    675                                                 UStack &foundBreaks ) const {
    676     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
    677         return 0;       // Not enough characters for two words
    678     }
    679 
    680     uint32_t wordsFound = 0;
    681     int32_t wordLength;
    682     int32_t current;
    683     UErrorCode status = U_ZERO_ERROR;
    684     PossibleWord words[KHMER_LOOKAHEAD];
    685     UChar32 uc;
    686 
    687     utext_setNativeIndex(text, rangeStart);
    688 
    689     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    690         wordLength = 0;
    691 
    692         // Look for candidate words at the current position
    693         int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    694 
    695         // If we found exactly one, use that
    696         if (candidates == 1) {
    697             wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
    698             wordsFound += 1;
    699         }
    700 
    701         // If there was more than one, see which one can take us forward the most words
    702         else if (candidates > 1) {
    703             // If we're already at the end of the range, we're done
    704             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    705                 goto foundBest;
    706             }
    707             do {
    708                 int wordsMatched = 1;
    709                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    710                     if (wordsMatched < 2) {
    711                         // Followed by another dictionary word; mark first word as a good candidate
    712                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    713                         wordsMatched = 2;
    714                     }
    715 
    716                     // If we're already at the end of the range, we're done
    717                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    718                         goto foundBest;
    719                     }
    720 
    721                     // See if any of the possible second words is followed by a third word
    722                     do {
    723                         // If we find a third word, stop right away
    724                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    725                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    726                             goto foundBest;
    727                         }
    728                     }
    729                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
    730                 }
    731             }
    732             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
    733 foundBest:
    734             wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    735             wordsFound += 1;
    736         }
    737 
    738         // We come here after having either found a word or not. We look ahead to the
    739         // next word. If it's not a dictionary word, we will combine it with the word we
    740         // just found (if there is one), but only if the preceding word does not exceed
    741         // the threshold.
    742         // The text iterator should now be positioned at the end of the word we found.
    743         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
    744             // if it is a dictionary word, do nothing. If it isn't, then if there is
    745             // no preceding word, or the non-word shares less than the minimum threshold
    746             // of characters with a dictionary word, then scan to resynchronize
    747             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    748                   && (wordLength == 0
    749                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
    750                 // Look for a plausible word boundary
    751                 //TODO: This section will need a rework for UText.
    752                 int32_t remaining = rangeEnd - (current+wordLength);
    753                 UChar32 pc = utext_current32(text);
    754                 int32_t chars = 0;
    755                 for (;;) {
    756                     utext_next32(text);
    757                     uc = utext_current32(text);
    758                     // TODO: Here we're counting on the fact that the SA languages are all
    759                     // in the BMP. This should get fixed with the UText rework.
    760                     chars += 1;
    761                     if (--remaining <= 0) {
    762                         break;
    763                     }
    764                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    765                         // Maybe. See if it's in the dictionary.
    766                         int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    767                         utext_setNativeIndex(text, current+wordLength+chars);
    768                         if (candidates > 0) {
    769                             break;
    770                         }
    771                     }
    772                     pc = uc;
    773                 }
    774 
    775                 // Bump the word count if there wasn't already one
    776                 if (wordLength <= 0) {
    777                     wordsFound += 1;
    778                 }
    779 
    780                 // Update the length with the passed-over characters
    781                 wordLength += chars;
    782             }
    783             else {
    784                 // Back up to where we were for next iteration
    785                 utext_setNativeIndex(text, current+wordLength);
    786             }
    787         }
    788 
    789         // Never stop before a combining mark.
    790         int32_t currPos;
    791         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    792             utext_next32(text);
    793             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    794         }
    795 
    796         // Look ahead for possible suffixes if a dictionary word does not follow.
    797         // We do this in code rather than using a rule so that the heuristic
    798         // resynch continues to function. For example, one of the suffix characters
    799         // could be a typo in the middle of a word.
    800 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
    801 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    802 //                && fSuffixSet.contains(uc = utext_current32(text))) {
    803 //                if (uc == KHMER_PAIYANNOI) {
    804 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
    805 //                        // Skip over previous end and PAIYANNOI
    806 //                        utext_next32(text);
    807 //                        utext_next32(text);
    808 //                        wordLength += 1;            // Add PAIYANNOI to word
    809 //                        uc = utext_current32(text);     // Fetch next character
    810 //                    }
    811 //                    else {
    812 //                        // Restore prior position
    813 //                        utext_next32(text);
    814 //                    }
    815 //                }
    816 //                if (uc == KHMER_MAIYAMOK) {
    817 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
    818 //                        // Skip over previous end and MAIYAMOK
    819 //                        utext_next32(text);
    820 //                        utext_next32(text);
    821 //                        wordLength += 1;            // Add MAIYAMOK to word
    822 //                    }
    823 //                    else {
    824 //                        // Restore prior position
    825 //                        utext_next32(text);
    826 //                    }
    827 //                }
    828 //            }
    829 //            else {
    830 //                utext_setNativeIndex(text, current+wordLength);
    831 //            }
    832 //        }
    833 
    834         // Did we find a word on this iteration? If so, push it on the break stack
    835         if (wordLength > 0) {
    836             foundBreaks.push((current+wordLength), status);
    837         }
    838     }
    839 
    840     // Don't return a break for the end of the dictionary range if there is one there.
    841     if (foundBreaks.peeki() >= rangeEnd) {
    842         (void) foundBreaks.popi();
    843         wordsFound -= 1;
    844     }
    845 
    846     return wordsFound;
    847 }
    848 
    849 #if !UCONFIG_NO_NORMALIZATION
    850 /*
    851  ******************************************************************
    852  * CjkBreakEngine
    853  */
    854 static const uint32_t kuint32max = 0xFFFFFFFF;
    855 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
    856 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    857     // Korean dictionary only includes Hangul syllables
    858     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    859     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    860     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    861     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
    862 
    863     if (U_SUCCESS(status)) {
    864         // handle Korean and Japanese/Chinese using different dictionaries
    865         if (type == kKorean) {
    866             setCharacters(fHangulWordSet);
    867         } else { //Chinese and Japanese
    868             UnicodeSet cjSet;
    869             cjSet.addAll(fHanWordSet);
    870             cjSet.addAll(fKatakanaWordSet);
    871             cjSet.addAll(fHiraganaWordSet);
    872             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
    873             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
    874             setCharacters(cjSet);
    875         }
    876     }
    877 }
    878 
    879 CjkBreakEngine::~CjkBreakEngine(){
    880     delete fDictionary;
    881 }
    882 
    883 // The katakanaCost values below are based on the length frequencies of all
    884 // katakana phrases in the dictionary
    885 static const int kMaxKatakanaLength = 8;
    886 static const int kMaxKatakanaGroupLength = 20;
    887 static const uint32_t maxSnlp = 255;
    888 
    889 static inline uint32_t getKatakanaCost(int wordLength){
    890     //TODO: fill array with actual values from dictionary!
    891     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
    892                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    893     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
    894 }
    895 
    896 static inline bool isKatakana(uint16_t value) {
    897     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
    898             (value >= 0xFF66u && value <= 0xFF9fu);
    899 }
    900 
    901 // A very simple helper class to streamline the buffer handling in
    902 // divideUpDictionaryRange.
    903 template<class T, size_t N>
    904 class AutoBuffer {
    905 public:
    906     AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    907         if (size > N) {
    908             buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    909             capacity = size;
    910         }
    911     }
    912     ~AutoBuffer() {
    913         if (buffer != stackBuffer)
    914             uprv_free(buffer);
    915     }
    916 
    917     T* elems() {
    918         return buffer;
    919     }
    920 
    921     const T& operator[] (size_t i) const {
    922         return buffer[i];
    923     }
    924 
    925     T& operator[] (size_t i) {
    926         return buffer[i];
    927     }
    928 
    929     // resize without copy
    930     void resize(size_t size) {
    931         if (size <= capacity)
    932             return;
    933         if (buffer != stackBuffer)
    934             uprv_free(buffer);
    935         buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    936         capacity = size;
    937     }
    938 
    939 private:
    940     T stackBuffer[N];
    941     T* buffer;
    942     AutoBuffer();
    943     size_t capacity;
    944 };
    945 
    946 
    947 /*
    948  * @param text A UText representing the text
    949  * @param rangeStart The start of the range of dictionary characters
    950  * @param rangeEnd The end of the range of dictionary characters
    951  * @param foundBreaks Output of C array of int32_t break positions, or 0
    952  * @return The number of breaks found
    953  */
    954 int32_t
    955 CjkBreakEngine::divideUpDictionaryRange( UText *text,
    956         int32_t rangeStart,
    957         int32_t rangeEnd,
    958         UStack &foundBreaks ) const {
    959     if (rangeStart >= rangeEnd) {
    960         return 0;
    961     }
    962 
    963     const size_t defaultInputLength = 80;
    964     size_t inputLength = rangeEnd - rangeStart;
    965     // TODO: Replace by UnicodeString.
    966     AutoBuffer<UChar, defaultInputLength> charString(inputLength);
    967 
    968     // Normalize the input string and put it in normalizedText.
    969     // The map from the indices of the normalized input to the raw
    970     // input is kept in charPositions.
    971     UErrorCode status = U_ZERO_ERROR;
    972     utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    973     if (U_FAILURE(status)) {
    974         return 0;
    975     }
    976 
    977     UnicodeString inputString(charString.elems(), inputLength);
    978     // TODO: Use Normalizer2.
    979     UNormalizationMode norm_mode = UNORM_NFKC;
    980     UBool isNormalized =
    981         Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
    982         Normalizer::isNormalized(inputString, norm_mode, status);
    983 
    984     // TODO: Replace by UVector32.
    985     AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    986     int numChars = 0;
    987     UText normalizedText = UTEXT_INITIALIZER;
    988     // Needs to be declared here because normalizedText holds onto its buffer.
    989     UnicodeString normalizedString;
    990     if (isNormalized) {
    991         int32_t index = 0;
    992         charPositions[0] = 0;
    993         while(index < inputString.length()) {
    994             index = inputString.moveIndex32(index, 1);
    995             charPositions[++numChars] = index;
    996         }
    997         utext_openUnicodeString(&normalizedText, &inputString, &status);
    998     }
    999     else {
   1000         Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
   1001         if (U_FAILURE(status)) {
   1002             return 0;
   1003         }
   1004         charPositions.resize(normalizedString.length() + 1);
   1005         Normalizer normalizer(charString.elems(), inputLength, norm_mode);
   1006         int32_t index = 0;
   1007         charPositions[0] = 0;
   1008         while(index < normalizer.endIndex()){
   1009             /* UChar32 uc = */ normalizer.next();
   1010             charPositions[++numChars] = index = normalizer.getIndex();
   1011         }
   1012         utext_openUnicodeString(&normalizedText, &normalizedString, &status);
   1013     }
   1014 
   1015     if (U_FAILURE(status)) {
   1016         return 0;
   1017     }
   1018 
   1019     // From this point on, all the indices refer to the indices of
   1020     // the normalized input string.
   1021 
   1022     // bestSnlp[i] is the snlp of the best segmentation of the first i
   1023     // characters in the range to be matched.
   1024     // TODO: Replace by UVector32.
   1025     AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
   1026     bestSnlp[0] = 0;
   1027     for(int i = 1; i <= numChars; i++) {
   1028         bestSnlp[i] = kuint32max;
   1029     }
   1030 
   1031     // prev[i] is the index of the last CJK character in the previous word in
   1032     // the best segmentation of the first i characters.
   1033     // TODO: Replace by UVector32.
   1034     AutoBuffer<int, defaultInputLength> prev(numChars + 1);
   1035     for(int i = 0; i <= numChars; i++){
   1036         prev[i] = -1;
   1037     }
   1038 
   1039     const size_t maxWordSize = 20;
   1040     // TODO: Replace both with UVector32.
   1041     AutoBuffer<int32_t, maxWordSize> values(numChars);
   1042     AutoBuffer<int32_t, maxWordSize> lengths(numChars);
   1043 
   1044     // Dynamic programming to find the best segmentation.
   1045     bool is_prev_katakana = false;
   1046     for (int32_t i = 0; i < numChars; ++i) {
   1047         //utext_setNativeIndex(text, rangeStart + i);
   1048         utext_setNativeIndex(&normalizedText, i);
   1049         if (bestSnlp[i] == kuint32max)
   1050             continue;
   1051 
   1052         int32_t count;
   1053         // limit maximum word length matched to size of current substring
   1054         int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
   1055 
   1056         fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
   1057 
   1058         // if there are no single character matches found in the dictionary
   1059         // starting with this charcter, treat character as a 1-character word
   1060         // with the highest value possible, i.e. the least likely to occur.
   1061         // Exclude Korean characters from this treatment, as they should be left
   1062         // together by default.
   1063         if((count == 0 || lengths[0] != 1) &&
   1064                 !fHangulWordSet.contains(utext_current32(&normalizedText))) {
   1065             values[count] = maxSnlp;
   1066             lengths[count++] = 1;
   1067         }
   1068 
   1069         for (int j = 0; j < count; j++) {
   1070             uint32_t newSnlp = bestSnlp[i] + values[j];
   1071             if (newSnlp < bestSnlp[lengths[j] + i]) {
   1072                 bestSnlp[lengths[j] + i] = newSnlp;
   1073                 prev[lengths[j] + i] = i;
   1074             }
   1075         }
   1076 
   1077         // In Japanese,
   1078         // Katakana word in single character is pretty rare. So we apply
   1079         // the following heuristic to Katakana: any continuous run of Katakana
   1080         // characters is considered a candidate word with a default cost
   1081         // specified in the katakanaCost table according to its length.
   1082         //utext_setNativeIndex(text, rangeStart + i);
   1083         utext_setNativeIndex(&normalizedText, i);
   1084         bool is_katakana = isKatakana(utext_current32(&normalizedText));
   1085         if (!is_prev_katakana && is_katakana) {
   1086             int j = i + 1;
   1087             utext_next32(&normalizedText);
   1088             // Find the end of the continuous run of Katakana characters
   1089             while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
   1090                     isKatakana(utext_current32(&normalizedText))) {
   1091                 utext_next32(&normalizedText);
   1092                 ++j;
   1093             }
   1094             if ((j - i) < kMaxKatakanaGroupLength) {
   1095                 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
   1096                 if (newSnlp < bestSnlp[j]) {
   1097                     bestSnlp[j] = newSnlp;
   1098                     prev[j] = i;
   1099                 }
   1100             }
   1101         }
   1102         is_prev_katakana = is_katakana;
   1103     }
   1104 
   1105     // Start pushing the optimal offset index into t_boundary (t for tentative).
   1106     // prev[numChars] is guaranteed to be meaningful.
   1107     // We'll first push in the reverse order, i.e.,
   1108     // t_boundary[0] = numChars, and afterwards do a swap.
   1109     // TODO: Replace by UVector32.
   1110     AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
   1111 
   1112     int numBreaks = 0;
   1113     // No segmentation found, set boundary to end of range
   1114     if (bestSnlp[numChars] == kuint32max) {
   1115         t_boundary[numBreaks++] = numChars;
   1116     } else {
   1117         for (int i = numChars; i > 0; i = prev[i]) {
   1118             t_boundary[numBreaks++] = i;
   1119         }
   1120         U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
   1121     }
   1122 
   1123     // Reverse offset index in t_boundary.
   1124     // Don't add a break for the start of the dictionary range if there is one
   1125     // there already.
   1126     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
   1127         t_boundary[numBreaks++] = 0;
   1128     }
   1129 
   1130     // Now that we're done, convert positions in t_bdry[] (indices in
   1131     // the normalized input string) back to indices in the raw input string
   1132     // while reversing t_bdry and pushing values to foundBreaks.
   1133     for (int i = numBreaks-1; i >= 0; i--) {
   1134         foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
   1135     }
   1136 
   1137     utext_close(&normalizedText);
   1138     return numBreaks;
   1139 }
   1140 #endif
   1141 
   1142 U_NAMESPACE_END
   1143 
   1144 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1145 
   1146