      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2014, International Business Machines Corporation
      4  * and others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "brkeng.h"
     13 #include "dictbe.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/chariter.h"
     16 #include "unicode/ubrk.h"
     17 #include "uvector.h"
     18 #include "uassert.h"
     19 #include "unicode/normlzr.h"
     20 #include "cmemory.h"
     21 #include "dictionarydata.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 /*
     26  ******************************************************************
     27  */
     28 
     29 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
     30     fTypes = breakTypes;
     31 }
     32 
     33 DictionaryBreakEngine::~DictionaryBreakEngine() {
     34 }
     35 
     36 UBool
     37 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
     38     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
     39             && fSet.contains(c));
     40 }
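// Editorial note (not part of the original source): handles() is a simple
// bitmask-plus-set test. For ThaiBreakEngine below, fTypes is
// (1<<UBRK_WORD) | (1<<UBRK_LINE), so a hedged example of the expected
// behaviour (with a hypothetical engine pointer) would be:
//
//     engine->handles(0x0E01 /* THAI KO KAI */, UBRK_WORD);      // TRUE
//     engine->handles(0x0E01 /* THAI KO KAI */, UBRK_CHARACTER); // FALSE: bit not in fTypes
//     engine->handles(0x0041 /* 'A' */,         UBRK_WORD);      // FALSE: not in fSet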
     41 
     42 int32_t
     43 DictionaryBreakEngine::findBreaks( UText *text,
     44                                  int32_t startPos,
     45                                  int32_t endPos,
     46                                  UBool reverse,
     47                                  int32_t breakType,
     48                                  UStack &foundBreaks ) const {
     49     int32_t result = 0;
     50 
     51     // Find the span of characters included in the set.
     52     //   The span to break begins at the current position in the text, and
     53     //   extends towards the start or end of the text, depending on 'reverse'.
     54 
     55     int32_t start = (int32_t)utext_getNativeIndex(text);
     56     int32_t current;
     57     int32_t rangeStart;
     58     int32_t rangeEnd;
     59     UChar32 c = utext_current32(text);
     60     if (reverse) {
     61         UBool   isDict = fSet.contains(c);
     62         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
     63             c = utext_previous32(text);
     64             isDict = fSet.contains(c);
     65         }
     66         rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
     67         rangeEnd = start + 1;
     68     }
     69     else {
     70         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
     71             utext_next32(text);         // TODO:  recast loop for postincrement
     72             c = utext_current32(text);
     73         }
     74         rangeStart = start;
     75         rangeEnd = current;
     76     }
     77     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
     78         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
     79         utext_setNativeIndex(text, current);
     80     }
     81 
     82     return result;
     83 }
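// Illustrative usage sketch (an editorial addition, not part of the original
// file): a hypothetical caller hands findBreaks() a UText positioned inside a
// run of dictionary characters and collects word-end offsets from the UStack.
// The names engine, ut, start and end are assumptions for the example.
#if 0
    UErrorCode status = U_ZERO_ERROR;
    UStack breaks(status);                       // receives int32_t break offsets
    utext_setNativeIndex(ut, start);             // position at the start of the run
    int32_t found = engine->findBreaks(ut, start, end,
                                       FALSE /* forward scan */, UBRK_WORD, breaks);
    for (int32_t i = 0; i < breaks.size(); ++i) {
        int32_t boundary = breaks.elementAti(i); // offset of a word end within the run
        // ... use boundary ...
    }
#endif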
     84 
     85 void
     86 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
     87     fSet = set;
     88     // Compact for caching
     89     fSet.compact();
     90 }
     91 
     92 /*
     93  ******************************************************************
     94  * PossibleWord
     95  */
     96 
     97 // Helper class for improving readability of the Thai/Lao/Khmer word break
     98 // algorithm. The implementation is completely inline.
     99 
    100 // List size, limited by the maximum number of words in the dictionary
    101 // that form a nested sequence.
    102 #define POSSIBLE_WORD_LIST_MAX 20
    103 
    104 class PossibleWord {
    105 private:
    106     // list of word candidate lengths, in increasing length order
    107     int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    108     int32_t   count;      // Count of candidates
    109     int32_t   prefix;     // The longest match with a dictionary word
    110     int32_t   offset;     // Offset in the text of these candidates
    111     int       mark;       // The preferred candidate's offset
    112     int       current;    // The candidate we're currently looking at
    113 
    114 public:
    115     PossibleWord();
    116     ~PossibleWord();
    117 
    118     // Fill the list of candidates if needed, select the longest, and return the number found
    119     int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
    120 
    121     // Select the currently marked candidate, point after it in the text, and invalidate self
    122     int32_t   acceptMarked( UText *text );
    123 
    124     // Back up from the current candidate to the next shorter one; return TRUE if one
    125     // exists, and leave the text positioned after it
    126     UBool     backUp( UText *text );
    127 
    128     // Return the longest prefix this candidate location shares with a dictionary word
    129     int32_t   longestPrefix();
    130 
    131     // Mark the current candidate as the one we like
    132     void      markCurrent();
    133 };
    134 
    135 inline
    136 PossibleWord::PossibleWord() {
    137     offset = -1;
    138 }
    139 
    140 inline
    141 PossibleWord::~PossibleWord() {
    142 }
    143 
    144 inline int
    145 PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    146     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    147     int32_t start = (int32_t)utext_getNativeIndex(text);
    148     if (start != offset) {
    149         offset = start;
    150         prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
    151         // Dictionary leaves text after longest prefix, not longest word. Back up.
    152         if (count <= 0) {
    153             utext_setNativeIndex(text, start);
    154         }
    155     }
    156     if (count > 0) {
    157         utext_setNativeIndex(text, start+lengths[count-1]);
    158     }
    159     current = count-1;
    160     mark = current;
    161     return count;
    162 }
    163 
    164 inline int32_t
    165 PossibleWord::acceptMarked( UText *text ) {
    166     utext_setNativeIndex(text, offset + lengths[mark]);
    167     return lengths[mark];
    168 }
    169 
    170 inline UBool
    171 PossibleWord::backUp( UText *text ) {
    172     if (current > 0) {
    173         utext_setNativeIndex(text, offset + lengths[--current]);
    174         return TRUE;
    175     }
    176     return FALSE;
    177 }
    178 
    179 inline int32_t
    180 PossibleWord::longestPrefix() {
    181     return prefix;
    182 }
    183 
    184 inline void
    185 PossibleWord::markCurrent() {
    186     mark = current;
    187 }
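// Editorial note (not part of the original source): the engines below keep a
// small circular buffer of PossibleWord entries, e.g.
//
//     PossibleWord words[THAI_LOOKAHEAD];
//     int n = words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
//     // ... choose among the n candidates with markCurrent()/backUp() ...
//     int32_t len = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
//
// candidates() leaves the text positioned after the longest candidate;
// acceptMarked() repositions it after the marked candidate and returns that
// candidate's length.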
    188 
    189 /*
    190  ******************************************************************
    191  * ThaiBreakEngine
    192  */
    193 
    194 // How many words in a row are "good enough"?
    195 #define THAI_LOOKAHEAD 3
    196 
    197 // Will not combine a non-word with a preceding dictionary word longer than this
    198 #define THAI_ROOT_COMBINE_THRESHOLD 3
    199 
    200 // Will not combine a non-word with a preceding word if the non-word shares
    201 // at least this much prefix with a dictionary word
    202 #define THAI_PREFIX_COMBINE_THRESHOLD 3
    203 
    204 // Elision character
    205 #define THAI_PAIYANNOI 0x0E2F
    206 
    207 // Repeat character
    208 #define THAI_MAIYAMOK 0x0E46
    209 
    210 // Minimum word size
    211 #define THAI_MIN_WORD 2
    212 
    213 // Minimum number of characters for two words
    214 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
    215 
    216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    217     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    218       fDictionary(adoptDictionary)
    219 {
    220     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    221     if (U_SUCCESS(status)) {
    222         setCharacters(fThaiWordSet);
    223     }
    224     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    225     fMarkSet.add(0x0020);
    226     fEndWordSet = fThaiWordSet;
    227     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    228     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    229     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    230     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    231     fSuffixSet.add(THAI_PAIYANNOI);
    232     fSuffixSet.add(THAI_MAIYAMOK);
    233 
    234     // Compact for caching.
    235     fMarkSet.compact();
    236     fEndWordSet.compact();
    237     fBeginWordSet.compact();
    238     fSuffixSet.compact();
    239 }
    240 
    241 ThaiBreakEngine::~ThaiBreakEngine() {
    242     delete fDictionary;
    243 }
    244 
    245 int32_t
    246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
    247                                                 int32_t rangeStart,
    248                                                 int32_t rangeEnd,
    249                                                 UStack &foundBreaks ) const {
    250     if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
    251         return 0;       // Not enough characters for two words
    252     }
    253 
    254     uint32_t wordsFound = 0;
    255     int32_t wordLength;
    256     int32_t current;
    257     UErrorCode status = U_ZERO_ERROR;
    258     PossibleWord words[THAI_LOOKAHEAD];
    259     UChar32 uc;
    260 
    261     utext_setNativeIndex(text, rangeStart);
    262 
    263     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    264         wordLength = 0;
    265 
    266         // Look for candidate words at the current position
    267         int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    268 
    269         // If we found exactly one, use that
    270         if (candidates == 1) {
    271             wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    272             wordsFound += 1;
    273         }
    274         // If there was more than one, see which one can take us forward the most words
    275         else if (candidates > 1) {
    276             // If we're already at the end of the range, we're done
    277             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    278                 goto foundBest;
    279             }
    280             do {
    281                 int wordsMatched = 1;
    282                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    283                     if (wordsMatched < 2) {
    284                         // Followed by another dictionary word; mark first word as a good candidate
    285                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
    286                         wordsMatched = 2;
    287                     }
    288 
    289                     // If we're already at the end of the range, we're done
    290                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    291                         goto foundBest;
    292                     }
    293 
    294                     // See if any of the possible second words is followed by a third word
    295                     do {
    296                         // If we find a third word, stop right away
    297                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    298                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
    299                             goto foundBest;
    300                         }
    301                     }
    302                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
    303                 }
    304             }
    305             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
    306 foundBest:
    307             wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
    308             wordsFound += 1;
    309         }
    310 
    311         // We come here whether or not we found a word. Look ahead to the next word;
    312         // if it is not a dictionary word, we will combine it with the word we just
    313         // found (if there is one), but only if the preceding word does not exceed
    314         // the threshold.
    315         // The text iterator should now be positioned at the end of the word we found.
    316         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
    317             // if it is a dictionary word, do nothing. If it isn't, then if there is
    318             // no preceding word, or the non-word shares less than the minimum threshold
    319             // of characters with a dictionary word, then scan to resynchronize
    320             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    321                   && (wordLength == 0
    322                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
    323                 // Look for a plausible word boundary
    324                 //TODO: This section will need a rework for UText.
    325                 int32_t remaining = rangeEnd - (current+wordLength);
    326                 UChar32 pc = utext_current32(text);
    327                 int32_t chars = 0;
    328                 for (;;) {
    329                     utext_next32(text);
    330                     uc = utext_current32(text);
    331                     // TODO: Here we're counting on the fact that the SA languages are all
    332                     // in the BMP. This should get fixed with the UText rework.
    333                     chars += 1;
    334                     if (--remaining <= 0) {
    335                         break;
    336                     }
    337                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    338                         // Maybe. See if it's in the dictionary.
    339                         // NOTE: In the original Apple code, checked that the next
    340                         // two characters after uc were not 0x0E4C THANTHAKHAT before
    341                         // checking the dictionary. That is just a performance filter,
    342                         // but it's not clear it's faster than checking the trie.
    343                         int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    344                         utext_setNativeIndex(text, current + wordLength + chars);
    345                         if (candidates > 0) {
    346                             break;
    347                         }
    348                     }
    349                     pc = uc;
    350                 }
    351 
    352                 // Bump the word count if there wasn't already one
    353                 if (wordLength <= 0) {
    354                     wordsFound += 1;
    355                 }
    356 
    357                 // Update the length with the passed-over characters
    358                 wordLength += chars;
    359             }
    360             else {
    361                 // Back up to where we were for next iteration
    362                 utext_setNativeIndex(text, current+wordLength);
    363             }
    364         }
    365 
    366         // Never stop before a combining mark.
    367         int32_t currPos;
    368         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    369             utext_next32(text);
    370             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    371         }
    372 
    373         // Look ahead for possible suffixes if a dictionary word does not follow.
    374         // We do this in code rather than using a rule so that the heuristic
    375         // resynch continues to function. For example, one of the suffix characters
    376         // could be a typo in the middle of a word.
    377         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
    378             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    379                 && fSuffixSet.contains(uc = utext_current32(text))) {
    380                 if (uc == THAI_PAIYANNOI) {
    381                     if (!fSuffixSet.contains(utext_previous32(text))) {
    382                         // Skip over previous end and PAIYANNOI
    383                         utext_next32(text);
    384                         utext_next32(text);
    385                         wordLength += 1;            // Add PAIYANNOI to word
    386                         uc = utext_current32(text);     // Fetch next character
    387                     }
    388                     else {
    389                         // Restore prior position
    390                         utext_next32(text);
    391                     }
    392                 }
    393                 if (uc == THAI_MAIYAMOK) {
    394                     if (utext_previous32(text) != THAI_MAIYAMOK) {
    395                         // Skip over previous end and MAIYAMOK
    396                         utext_next32(text);
    397                         utext_next32(text);
    398                         wordLength += 1;            // Add MAIYAMOK to word
    399                     }
    400                     else {
    401                         // Restore prior position
    402                         utext_next32(text);
    403                     }
    404                 }
    405             }
    406             else {
    407                 utext_setNativeIndex(text, current+wordLength);
    408             }
    409         }
    410 
    411         // Did we find a word on this iteration? If so, push it on the break stack
    412         if (wordLength > 0) {
    413             foundBreaks.push((current+wordLength), status);
    414         }
    415     }
    416 
    417     // Don't return a break for the end of the dictionary range if there is one there.
    418     if (foundBreaks.peeki() >= rangeEnd) {
    419         (void) foundBreaks.popi();
    420         wordsFound -= 1;
    421     }
    422 
    423     return wordsFound;
    424 }
    425 
    426 /*
    427  ******************************************************************
    428  * LaoBreakEngine
    429  */
    430 
    431 // How many words in a row are "good enough"?
    432 #define LAO_LOOKAHEAD 3
    433 
    434 // Will not combine a non-word with a preceding dictionary word longer than this
    435 #define LAO_ROOT_COMBINE_THRESHOLD 3
    436 
    437 // Will not combine a non-word with a preceding word if the non-word shares
    438 // at least this much prefix with a dictionary word
    439 #define LAO_PREFIX_COMBINE_THRESHOLD 3
    440 
    441 // Minimum word size
    442 #define LAO_MIN_WORD 2
    443 
    444 // Minimum number of characters for two words
    445 #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
    446 
    447 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    448     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
    449       fDictionary(adoptDictionary)
    450 {
    451     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    452     if (U_SUCCESS(status)) {
    453         setCharacters(fLaoWordSet);
    454     }
    455     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    456     fMarkSet.add(0x0020);
    457     fEndWordSet = fLaoWordSet;
    458     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    459     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    460     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    461     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
    462 
    463     // Compact for caching.
    464     fMarkSet.compact();
    465     fEndWordSet.compact();
    466     fBeginWordSet.compact();
    467 }
    468 
    469 LaoBreakEngine::~LaoBreakEngine() {
    470     delete fDictionary;
    471 }
    472 
    473 int32_t
    474 LaoBreakEngine::divideUpDictionaryRange( UText *text,
    475                                                 int32_t rangeStart,
    476                                                 int32_t rangeEnd,
    477                                                 UStack &foundBreaks ) const {
    478     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
    479         return 0;       // Not enough characters for two words
    480     }
    481 
    482     uint32_t wordsFound = 0;
    483     int32_t wordLength;
    484     int32_t current;
    485     UErrorCode status = U_ZERO_ERROR;
    486     PossibleWord words[LAO_LOOKAHEAD];
    487     UChar32 uc;
    488 
    489     utext_setNativeIndex(text, rangeStart);
    490 
    491     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    492         wordLength = 0;
    493 
    494         // Look for candidate words at the current position
    495         int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    496 
    497         // If we found exactly one, use that
    498         if (candidates == 1) {
    499             wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    500             wordsFound += 1;
    501         }
    502         // If there was more than one, see which one can take us forward the most words
    503         else if (candidates > 1) {
    504             // If we're already at the end of the range, we're done
    505             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    506                 goto foundBest;
    507             }
    508             do {
    509                 int wordsMatched = 1;
    510                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    511                     if (wordsMatched < 2) {
    512                         // Followed by another dictionary word; mark first word as a good candidate
    513                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
    514                         wordsMatched = 2;
    515                     }
    516 
    517                     // If we're already at the end of the range, we're done
    518                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    519                         goto foundBest;
    520                     }
    521 
    522                     // See if any of the possible second words is followed by a third word
    523                     do {
    524                         // If we find a third word, stop right away
    525                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    526                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
    527                             goto foundBest;
    528                         }
    529                     }
    530                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
    531                 }
    532             }
    533             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
    534 foundBest:
    535             wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
    536             wordsFound += 1;
    537         }
    538 
    539         // We come here whether or not we found a word. Look ahead to the next word;
    540         // if it is not a dictionary word, we will combine it with the word we just
    541         // found (if there is one), but only if the preceding word does not exceed
    542         // the threshold.
    543         // The text iterator should now be positioned at the end of the word we found.
    544         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
    545             // if it is a dictionary word, do nothing. If it isn't, then if there is
    546             // no preceding word, or the non-word shares less than the minimum threshold
    547             // of characters with a dictionary word, then scan to resynchronize
    548             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    549                   && (wordLength == 0
    550                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
    551                 // Look for a plausible word boundary
    552                 //TODO: This section will need a rework for UText.
    553                 int32_t remaining = rangeEnd - (current+wordLength);
    554                 UChar32 pc = utext_current32(text);
    555                 int32_t chars = 0;
    556                 for (;;) {
    557                     utext_next32(text);
    558                     uc = utext_current32(text);
    559                     // TODO: Here we're counting on the fact that the SA languages are all
    560                     // in the BMP. This should get fixed with the UText rework.
    561                     chars += 1;
    562                     if (--remaining <= 0) {
    563                         break;
    564                     }
    565                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    566                         // Maybe. See if it's in the dictionary.
    567                         int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    568                         utext_setNativeIndex(text, current + wordLength + chars);
    569                         if (candidates > 0) {
    570                             break;
    571                         }
    572                     }
    573                     pc = uc;
    574                 }
    575 
    576                 // Bump the word count if there wasn't already one
    577                 if (wordLength <= 0) {
    578                     wordsFound += 1;
    579                 }
    580 
    581                 // Update the length with the passed-over characters
    582                 wordLength += chars;
    583             }
    584             else {
    585                 // Back up to where we were for next iteration
    586                 utext_setNativeIndex(text, current+wordLength);
    587             }
    588         }
    589 
    590         // Never stop before a combining mark.
    591         int32_t currPos;
    592         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    593             utext_next32(text);
    594             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    595         }
    596 
    597         // Look ahead for possible suffixes if a dictionary word does not follow.
    598         // We do this in code rather than using a rule so that the heuristic
    599         // resynch continues to function. For example, one of the suffix characters
    600         // could be a typo in the middle of a word.
    601         // NOT CURRENTLY APPLICABLE TO LAO
    602 
    603         // Did we find a word on this iteration? If so, push it on the break stack
    604         if (wordLength > 0) {
    605             foundBreaks.push((current+wordLength), status);
    606         }
    607     }
    608 
    609     // Don't return a break for the end of the dictionary range if there is one there.
    610     if (foundBreaks.peeki() >= rangeEnd) {
    611         (void) foundBreaks.popi();
    612         wordsFound -= 1;
    613     }
    614 
    615     return wordsFound;
    616 }
    617 
    618 /*
    619  ******************************************************************
    620  * KhmerBreakEngine
    621  */
    622 
    623 // How many words in a row are "good enough"?
    624 #define KHMER_LOOKAHEAD 3
    625 
    626 // Will not combine a non-word with a preceding dictionary word longer than this
    627 #define KHMER_ROOT_COMBINE_THRESHOLD 3
    628 
    629 // Will not combine a non-word with a preceding word if the non-word shares
    630 // at least this much prefix with a dictionary word
    631 #define KHMER_PREFIX_COMBINE_THRESHOLD 3
    632 
    633 // Minimum word size
    634 #define KHMER_MIN_WORD 2
    635 
    636 // Minimum number of characters for two words
    637 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
    638 
    639 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    640     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
    641       fDictionary(adoptDictionary)
    642 {
    643     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    644     if (U_SUCCESS(status)) {
    645         setCharacters(fKhmerWordSet);
    646     }
    647     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    648     fMarkSet.add(0x0020);
    649     fEndWordSet = fKhmerWordSet;
    650     fBeginWordSet.add(0x1780, 0x17B3);
    651     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    652     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    653     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    654     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    655     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    656 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    657 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    658 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    659 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    660 //    fSuffixSet.add(THAI_PAIYANNOI);
    661 //    fSuffixSet.add(THAI_MAIYAMOK);
    662 
    663     // Compact for caching.
    664     fMarkSet.compact();
    665     fEndWordSet.compact();
    666     fBeginWordSet.compact();
    667 //    fSuffixSet.compact();
    668 }
    669 
    670 KhmerBreakEngine::~KhmerBreakEngine() {
    671     delete fDictionary;
    672 }
    673 
    674 int32_t
    675 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
    676                                                 int32_t rangeStart,
    677                                                 int32_t rangeEnd,
    678                                                 UStack &foundBreaks ) const {
    679     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
    680         return 0;       // Not enough characters for two words
    681     }
    682 
    683     uint32_t wordsFound = 0;
    684     int32_t wordLength;
    685     int32_t current;
    686     UErrorCode status = U_ZERO_ERROR;
    687     PossibleWord words[KHMER_LOOKAHEAD];
    688     UChar32 uc;
    689 
    690     utext_setNativeIndex(text, rangeStart);
    691 
    692     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
    693         wordLength = 0;
    694 
    695         // Look for candidate words at the current position
    696         int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    697 
    698         // If we found exactly one, use that
    699         if (candidates == 1) {
    700             wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
    701             wordsFound += 1;
    702         }
    703 
    704         // If there was more than one, see which one can take us forward the most words
    705         else if (candidates > 1) {
    706             // If we're already at the end of the range, we're done
    707             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    708                 goto foundBest;
    709             }
    710             do {
    711                 int wordsMatched = 1;
    712                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
    713                     if (wordsMatched < 2) {
    714                         // Followed by another dictionary word; mark first word as a good candidate
    715                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    716                         wordsMatched = 2;
    717                     }
    718 
    719                     // If we're already at the end of the range, we're done
    720                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
    721                         goto foundBest;
    722                     }
    723 
    724                     // See if any of the possible second words is followed by a third word
    725                     do {
    726                         // If we find a third word, stop right away
    727                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
    728                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
    729                             goto foundBest;
    730                         }
    731                     }
    732                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
    733                 }
    734             }
    735             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
    736 foundBest:
    737             wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
    738             wordsFound += 1;
    739         }
    740 
    741         // We come here whether or not we found a word. Look ahead to the next word;
    742         // if it is not a dictionary word, we will combine it with the word we just
    743         // found (if there is one), but only if the preceding word does not exceed
    744         // the threshold.
    745         // The text iterator should now be positioned at the end of the word we found.
    746         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
    747             // if it is a dictionary word, do nothing. If it isn't, then if there is
    748             // no preceding word, or the non-word shares less than the minimum threshold
    749             // of characters with a dictionary word, then scan to resynchronize
    750             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    751                   && (wordLength == 0
    752                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
    753                 // Look for a plausible word boundary
    754                 //TODO: This section will need a rework for UText.
    755                 int32_t remaining = rangeEnd - (current+wordLength);
    756                 UChar32 pc = utext_current32(text);
    757                 int32_t chars = 0;
    758                 for (;;) {
    759                     utext_next32(text);
    760                     uc = utext_current32(text);
    761                     // TODO: Here we're counting on the fact that the SA languages are all
    762                     // in the BMP. This should get fixed with the UText rework.
    763                     chars += 1;
    764                     if (--remaining <= 0) {
    765                         break;
    766                     }
    767                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
    768                         // Maybe. See if it's in the dictionary.
    769                         int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
    770                         utext_setNativeIndex(text, current+wordLength+chars);
    771                         if (candidates > 0) {
    772                             break;
    773                         }
    774                     }
    775                     pc = uc;
    776                 }
    777 
    778                 // Bump the word count if there wasn't already one
    779                 if (wordLength <= 0) {
    780                     wordsFound += 1;
    781                 }
    782 
    783                 // Update the length with the passed-over characters
    784                 wordLength += chars;
    785             }
    786             else {
    787                 // Back up to where we were for next iteration
    788                 utext_setNativeIndex(text, current+wordLength);
    789             }
    790         }
    791 
    792         // Never stop before a combining mark.
    793         int32_t currPos;
    794         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
    795             utext_next32(text);
    796             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
    797         }
    798 
    799         // Look ahead for possible suffixes if a dictionary word does not follow.
    800         // We do this in code rather than using a rule so that the heuristic
    801         // resynch continues to function. For example, one of the suffix characters
    802         // could be a typo in the middle of a word.
    803 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
    804 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
    805 //                && fSuffixSet.contains(uc = utext_current32(text))) {
    806 //                if (uc == KHMER_PAIYANNOI) {
    807 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
    808 //                        // Skip over previous end and PAIYANNOI
    809 //                        utext_next32(text);
    810 //                        utext_next32(text);
    811 //                        wordLength += 1;            // Add PAIYANNOI to word
    812 //                        uc = utext_current32(text);     // Fetch next character
    813 //                    }
    814 //                    else {
    815 //                        // Restore prior position
    816 //                        utext_next32(text);
    817 //                    }
    818 //                }
    819 //                if (uc == KHMER_MAIYAMOK) {
    820 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
    821 //                        // Skip over previous end and MAIYAMOK
    822 //                        utext_next32(text);
    823 //                        utext_next32(text);
    824 //                        wordLength += 1;            // Add MAIYAMOK to word
    825 //                    }
    826 //                    else {
    827 //                        // Restore prior position
    828 //                        utext_next32(text);
    829 //                    }
    830 //                }
    831 //            }
    832 //            else {
    833 //                utext_setNativeIndex(text, current+wordLength);
    834 //            }
    835 //        }
    836 
    837         // Did we find a word on this iteration? If so, push it on the break stack
    838         if (wordLength > 0) {
    839             foundBreaks.push((current+wordLength), status);
    840         }
    841     }
    842 
    843     // Don't return a break for the end of the dictionary range if there is one there.
    844     if (foundBreaks.peeki() >= rangeEnd) {
    845         (void) foundBreaks.popi();
    846         wordsFound -= 1;
    847     }
    848 
    849     return wordsFound;
    850 }
    851 
    852 #if !UCONFIG_NO_NORMALIZATION
    853 /*
    854  ******************************************************************
    855  * CjkBreakEngine
    856  */
    857 static const uint32_t kuint32max = 0xFFFFFFFF;
    858 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
    859 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    860     // Korean dictionary only includes Hangul syllables
    861     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    862     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    863     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    864     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
    865 
    866     if (U_SUCCESS(status)) {
    867         // handle Korean and Japanese/Chinese using different dictionaries
    868         if (type == kKorean) {
    869             setCharacters(fHangulWordSet);
    870         } else { //Chinese and Japanese
    871             UnicodeSet cjSet;
    872             cjSet.addAll(fHanWordSet);
    873             cjSet.addAll(fKatakanaWordSet);
    874             cjSet.addAll(fHiraganaWordSet);
    875             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
    876             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
    877             setCharacters(cjSet);
    878         }
    879     }
    880 }
    881 
    882 CjkBreakEngine::~CjkBreakEngine(){
    883     delete fDictionary;
    884 }
    885 
    886 // The katakanaCost values below are based on the length frequencies of all
    887 // katakana phrases in the dictionary
    888 static const int kMaxKatakanaLength = 8;
    889 static const int kMaxKatakanaGroupLength = 20;
    890 static const uint32_t maxSnlp = 255;
    891 
    892 static inline uint32_t getKatakanaCost(int wordLength){
    893     //TODO: fill array with actual values from dictionary!
    894     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
    895                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    896     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
    897 }
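// Editorial worked example (derived from the table above): getKatakanaCost(1)
// is 984, getKatakanaCost(3) is 240, and any length above kMaxKatakanaLength
// (e.g. getKatakanaCost(9)) is clamped to 8192, the same cost as length 0.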
    898 
    899 static inline bool isKatakana(uint16_t value) {
    900     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
    901             (value >= 0xFF66u && value <= 0xFF9fu);
    902 }
    903 
    904 // A very simple helper class to streamline the buffer handling in
    905 // divideUpDictionaryRange.
    906 template<class T, size_t N>
    907 class AutoBuffer {
    908 public:
    909     AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    910         if (size > N) {
    911             buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    912             capacity = size;
    913         }
    914     }
    915     ~AutoBuffer() {
    916         if (buffer != stackBuffer)
    917             uprv_free(buffer);
    918     }
    919 
    920     T* elems() {
    921         return buffer;
    922     }
    923 
    924     const T& operator[] (size_t i) const {
    925         return buffer[i];
    926     }
    927 
    928     T& operator[] (size_t i) {
    929         return buffer[i];
    930     }
    931 
    932     // resize without copy
    933     void resize(size_t size) {
    934         if (size <= capacity)
    935             return;
    936         if (buffer != stackBuffer)
    937             uprv_free(buffer);
    938         buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    939         capacity = size;
    940     }
    941 
    942 private:
    943     T stackBuffer[N];
    944     T* buffer;
    945     AutoBuffer();
    946     size_t capacity;
    947 };
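// Illustrative usage sketch (editorial addition): AutoBuffer keeps small
// allocations in its stack array and falls back to uprv_malloc for larger
// requests. A hedged example, with n standing in for a caller-supplied size:
#if 0
    AutoBuffer<UChar, 80> buf(n);   // stack storage if n <= 80, heap otherwise
    UChar *raw = buf.elems();       // raw pointer suitable for C APIs like utext_extract
    buf[0] = 0x0041;                // element access through operator[]
    buf.resize(2 * n);              // may reallocate; existing contents are NOT copied
#endif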
    948 
    949 
    950 /*
    951  * @param text A UText representing the text
    952  * @param rangeStart The start of the range of dictionary characters
    953  * @param rangeEnd The end of the range of dictionary characters
    954  * @param foundBreaks Output UStack to which word break positions are appended
    955  * @return The number of breaks found
    956  */
    957 int32_t
    958 CjkBreakEngine::divideUpDictionaryRange( UText *text,
    959         int32_t rangeStart,
    960         int32_t rangeEnd,
    961         UStack &foundBreaks ) const {
    962     if (rangeStart >= rangeEnd) {
    963         return 0;
    964     }
    965 
    966     const size_t defaultInputLength = 80;
    967     size_t inputLength = rangeEnd - rangeStart;
    968     // TODO: Replace by UnicodeString.
    969     AutoBuffer<UChar, defaultInputLength> charString(inputLength);
    970 
    971     // Normalize the input string and put it in normalizedText.
    972     // The map from the indices of the normalized input to the raw
    973     // input is kept in charPositions.
    974     UErrorCode status = U_ZERO_ERROR;
    975     utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    976     if (U_FAILURE(status)) {
    977         return 0;
    978     }
    979 
    980     UnicodeString inputString(charString.elems(), inputLength);
    981     // TODO: Use Normalizer2.
    982     UNormalizationMode norm_mode = UNORM_NFKC;
    983     UBool isNormalized =
    984         Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
    985         Normalizer::isNormalized(inputString, norm_mode, status);
    986 
    987     // TODO: Replace by UVector32.
    988     AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    989     int numChars = 0;
    990     UText normalizedText = UTEXT_INITIALIZER;
    991     // Needs to be declared here because normalizedText holds onto its buffer.
    992     UnicodeString normalizedString;
    993     if (isNormalized) {
    994         int32_t index = 0;
    995         charPositions[0] = 0;
    996         while(index < inputString.length()) {
    997             index = inputString.moveIndex32(index, 1);
    998             charPositions[++numChars] = index;
    999         }
   1000         utext_openUnicodeString(&normalizedText, &inputString, &status);
   1001     }
   1002     else {
   1003         Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
   1004         if (U_FAILURE(status)) {
   1005             return 0;
   1006         }
   1007         charPositions.resize(normalizedString.length() + 1);
   1008         Normalizer normalizer(charString.elems(), inputLength, norm_mode);
   1009         int32_t index = 0;
   1010         charPositions[0] = 0;
   1011         while(index < normalizer.endIndex()){
   1012             /* UChar32 uc = */ normalizer.next();
   1013             charPositions[++numChars] = index = normalizer.getIndex();
   1014         }
   1015         utext_openUnicodeString(&normalizedText, &normalizedString, &status);
   1016     }
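    // Editorial example (a purely illustrative assumption, not from the original
    // source): for an already-normalized input of the three code points
    // U+4E2D U+20000 U+6587 (four UChars, since U+20000 needs a surrogate pair),
    // the mapping built above gives numChars == 3 and charPositions == {0, 1, 3, 4},
    // i.e. charPositions[k] is the UTF-16 offset just after the k-th code point.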
   1017 
   1018     if (U_FAILURE(status)) {
   1019         return 0;
   1020     }
   1021 
   1022     // From this point on, all the indices refer to the indices of
   1023     // the normalized input string.
   1024 
   1025     // bestSnlp[i] is the snlp of the best segmentation of the first i
   1026     // characters in the range to be matched.
   1027     // TODO: Replace by UVector32.
   1028     AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
   1029     bestSnlp[0] = 0;
   1030     for(int i = 1; i <= numChars; i++) {
   1031         bestSnlp[i] = kuint32max;
   1032     }
   1033 
   1034     // prev[i] is the index of the last CJK character in the previous word in
   1035     // the best segmentation of the first i characters.
   1036     // TODO: Replace by UVector32.
   1037     AutoBuffer<int, defaultInputLength> prev(numChars + 1);
   1038     for(int i = 0; i <= numChars; i++){
   1039         prev[i] = -1;
   1040     }
   1041 
   1042     const size_t maxWordSize = 20;
   1043     // TODO: Replace both with UVector32.
   1044     AutoBuffer<int32_t, maxWordSize> values(numChars);
   1045     AutoBuffer<int32_t, maxWordSize> lengths(numChars);
   1046 
   1047     // Dynamic programming to find the best segmentation.
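    // Editorial note (not part of the original source): the loop below performs a
    // Viterbi-style relaxation. For each reachable position i (bestSnlp[i] !=
    // kuint32max) and each dictionary match of length lengths[j] with cost
    // values[j] starting at i, it applies
    //
    //     newSnlp = bestSnlp[i] + values[j];
    //     if (newSnlp < bestSnlp[i + lengths[j]]) {
    //         bestSnlp[i + lengths[j]] = newSnlp;
    //         prev[i + lengths[j]] = i;
    //     }
    //
    // so that bestSnlp[numChars] is the cost of the best segmentation and prev[]
    // can be walked backwards to recover the chosen word boundaries.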
   1048     bool is_prev_katakana = false;
   1049     for (int32_t i = 0; i < numChars; ++i) {
   1050         //utext_setNativeIndex(text, rangeStart + i);
   1051         utext_setNativeIndex(&normalizedText, i);
   1052         if (bestSnlp[i] == kuint32max)
   1053             continue;
   1054 
   1055         int32_t count;
   1056         // limit maximum word length matched to size of current substring
   1057         int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
   1058 
   1059         fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
   1060 
   1061         // if there are no single character matches found in the dictionary
   1062         // starting with this character, treat the character as a 1-character word
   1063         // with the highest value possible, i.e. the least likely to occur.
   1064         // Exclude Korean characters from this treatment, as they should be left
   1065         // together by default.
   1066         if((count == 0 || lengths[0] != 1) &&
   1067                 !fHangulWordSet.contains(utext_current32(&normalizedText))) {
   1068             values[count] = maxSnlp;
   1069             lengths[count++] = 1;
   1070         }
   1071 
   1072         for (int j = 0; j < count; j++) {
   1073             uint32_t newSnlp = bestSnlp[i] + values[j];
   1074             if (newSnlp < bestSnlp[lengths[j] + i]) {
   1075                 bestSnlp[lengths[j] + i] = newSnlp;
   1076                 prev[lengths[j] + i] = i;
   1077             }
   1078         }
   1079 
   1080         // In Japanese,
   1081         // a single-character Katakana word is pretty rare. So we apply
   1082         // the following heuristic to Katakana: any continuous run of Katakana
   1083         // characters is considered a candidate word with a default cost
   1084         // specified in the katakanaCost table according to its length.
   1085         //utext_setNativeIndex(text, rangeStart + i);
   1086         utext_setNativeIndex(&normalizedText, i);
   1087         bool is_katakana = isKatakana(utext_current32(&normalizedText));
   1088         if (!is_prev_katakana && is_katakana) {
   1089             int j = i + 1;
   1090             utext_next32(&normalizedText);
   1091             // Find the end of the continuous run of Katakana characters
   1092             while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
   1093                     isKatakana(utext_current32(&normalizedText))) {
   1094                 utext_next32(&normalizedText);
   1095                 ++j;
   1096             }
   1097             if ((j - i) < kMaxKatakanaGroupLength) {
   1098                 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
   1099                 if (newSnlp < bestSnlp[j]) {
   1100                     bestSnlp[j] = newSnlp;
   1101                     prev[j] = i;
   1102                 }
   1103             }
   1104         }
   1105         is_prev_katakana = is_katakana;
   1106     }
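    // Editorial worked example (an assumption for illustration, not from the
    // original source): a run of three Katakana characters starting at a
    // reachable position i is offered as one extra candidate word covering the
    // whole run, with cost getKatakanaCost(3) == 240; it wins at position i + 3
    // only if bestSnlp[i] + 240 beats the best dictionary path found so far.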
   1107 
   1108     // Start pushing the optimal offset index into t_boundary (t for tentative).
   1109     // prev[numChars] is guaranteed to be meaningful.
   1110     // We'll first push in the reverse order, i.e., t_boundary[0] = numChars,
   1111     // and afterwards copy the values out in reverse when filling foundBreaks.
   1112     // TODO: Replace by UVector32.
   1113     AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
   1114 
   1115     int numBreaks = 0;
   1116     // No segmentation found, set boundary to end of range
   1117     if (bestSnlp[numChars] == kuint32max) {
   1118         t_boundary[numBreaks++] = numChars;
   1119     } else {
   1120         for (int i = numChars; i > 0; i = prev[i]) {
   1121             t_boundary[numBreaks++] = i;
   1122         }
   1123         U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
   1124     }
   1125 
   1126     // Reverse offset index in t_boundary.
   1127     // Don't add a break for the start of the dictionary range if there is one
   1128     // there already.
   1129     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
   1130         t_boundary[numBreaks++] = 0;
   1131     }
   1132 
   1133     // Now that we're done, convert positions in t_boundary[] (indices in
   1134     // the normalized input string) back to indices in the raw input string
   1135     // while reversing t_boundary and pushing values to foundBreaks.
   1136     for (int i = numBreaks-1; i >= 0; i--) {
   1137         foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
   1138     }
   1139 
   1140     utext_close(&normalizedText);
   1141     return numBreaks;
   1142 }
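// Editorial worked example (hypothetical values, for illustration only): if the
// back-trace chain is prev[numChars] == 5, prev[5] == 2 and prev[2] == 0, the
// loop above fills t_boundary with {numChars, 5, 2}, appends 0 when the caller
// has not already emitted a break at rangeStart, and the final loop pushes
// rangeStart + charPositions[k] for k = 0, 2, 5, numChars in increasing order.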
   1143 #endif
   1144 
   1145 U_NAMESPACE_END
   1146 
   1147 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1148 
   1149