1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006-2008,2011, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "unicode/uniset.h" 15 #include "unicode/chariter.h" 16 #include "unicode/ubrk.h" 17 #include "uvector.h" 18 #include "triedict.h" 19 20 U_NAMESPACE_BEGIN 21 22 /* 23 ****************************************************************** 24 */ 25 26 /*DictionaryBreakEngine::DictionaryBreakEngine() { 27 fTypes = 0; 28 }*/ 29 30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 31 fTypes = breakTypes; 32 } 33 34 DictionaryBreakEngine::~DictionaryBreakEngine() { 35 } 36 37 UBool 38 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 39 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 40 && fSet.contains(c)); 41 } 42 43 int32_t 44 DictionaryBreakEngine::findBreaks( UText *text, 45 int32_t startPos, 46 int32_t endPos, 47 UBool reverse, 48 int32_t breakType, 49 UStack &foundBreaks ) const { 50 int32_t result = 0; 51 52 // Find the span of characters included in the set. 53 int32_t start = (int32_t)utext_getNativeIndex(text); 54 int32_t current; 55 int32_t rangeStart; 56 int32_t rangeEnd; 57 UChar32 c = utext_current32(text); 58 if (reverse) { 59 UBool isDict = fSet.contains(c); 60 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 61 c = utext_previous32(text); 62 isDict = fSet.contains(c); 63 } 64 rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); 65 rangeEnd = start + 1; 66 } 67 else { 68 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 69 utext_next32(text); // TODO: recast loop for postincrement 70 c = utext_current32(text); 71 } 72 rangeStart = start; 73 rangeEnd = current; 74 } 75 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 76 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 77 utext_setNativeIndex(text, current); 78 } 79 80 return result; 81 } 82 83 void 84 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 85 fSet = set; 86 // Compact for caching 87 fSet.compact(); 88 } 89 90 /*void 91 DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { 92 fTypes = breakTypes; 93 }*/ 94 95 /* 96 ****************************************************************** 97 */ 98 99 100 // Helper class for improving readability of the Thai word break 101 // algorithm. The implementation is completely inline. 102 103 // List size, limited by the maximum number of words in the dictionary 104 // that form a nested sequence. 105 #define POSSIBLE_WORD_LIST_MAX 20 106 107 class PossibleWord { 108 private: 109 // list of word candidate lengths, in increasing length order 110 int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 111 int count; // Count of candidates 112 int32_t prefix; // The longest match with a dictionary word 113 int32_t offset; // Offset in the text of these candidates 114 int mark; // The preferred candidate's offset 115 int current; // The candidate we're currently looking at 116 117 public: 118 PossibleWord(); 119 ~PossibleWord(); 120 121 // Fill the list of candidates if needed, select the longest, and return the number found 122 int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); 123 124 // Select the currently marked candidate, point after it in the text, and invalidate self 125 int32_t acceptMarked( UText *text ); 126 127 // Back up from the current candidate to the next shorter one; return TRUE if that exists 128 // and point the text after it 129 UBool backUp( UText *text ); 130 131 // Return the longest prefix this candidate location shares with a dictionary word 132 int32_t longestPrefix(); 133 134 // Mark the current candidate as the one we like 135 void markCurrent(); 136 }; 137 138 inline 139 PossibleWord::PossibleWord() { 140 offset = -1; 141 } 142 143 inline 144 PossibleWord::~PossibleWord() { 145 } 146 147 inline int 148 PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { 149 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 150 int32_t start = (int32_t)utext_getNativeIndex(text); 151 if (start != offset) { 152 offset = start; 153 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 154 // Dictionary leaves text after longest prefix, not longest word. Back up. 155 if (count <= 0) { 156 utext_setNativeIndex(text, start); 157 } 158 } 159 if (count > 0) { 160 utext_setNativeIndex(text, start+lengths[count-1]); 161 } 162 current = count-1; 163 mark = current; 164 return count; 165 } 166 167 inline int32_t 168 PossibleWord::acceptMarked( UText *text ) { 169 utext_setNativeIndex(text, offset + lengths[mark]); 170 return lengths[mark]; 171 } 172 173 inline UBool 174 PossibleWord::backUp( UText *text ) { 175 if (current > 0) { 176 utext_setNativeIndex(text, offset + lengths[--current]); 177 return TRUE; 178 } 179 return FALSE; 180 } 181 182 inline int32_t 183 PossibleWord::longestPrefix() { 184 return prefix; 185 } 186 187 inline void 188 PossibleWord::markCurrent() { 189 mark = current; 190 } 191 192 // How many words in a row are "good enough"? 193 #define THAI_LOOKAHEAD 3 194 195 // Will not combine a non-word with a preceding dictionary word longer than this 196 #define THAI_ROOT_COMBINE_THRESHOLD 3 197 198 // Will not combine a non-word that shares at least this much prefix with a 199 // dictionary word, with a preceding word 200 #define THAI_PREFIX_COMBINE_THRESHOLD 3 201 202 // Ellision character 203 #define THAI_PAIYANNOI 0x0E2F 204 205 // Repeat character 206 #define THAI_MAIYAMOK 0x0E46 207 208 // Minimum word size 209 #define THAI_MIN_WORD 2 210 211 // Minimum number of characters for two words 212 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 213 214 ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 215 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 216 fDictionary(adoptDictionary) 217 { 218 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 219 if (U_SUCCESS(status)) { 220 setCharacters(fThaiWordSet); 221 } 222 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 223 fMarkSet.add(0x0020); 224 fEndWordSet = fThaiWordSet; 225 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 226 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 227 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 228 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 229 fSuffixSet.add(THAI_PAIYANNOI); 230 fSuffixSet.add(THAI_MAIYAMOK); 231 232 // Compact for caching. 233 fMarkSet.compact(); 234 fEndWordSet.compact(); 235 fBeginWordSet.compact(); 236 fSuffixSet.compact(); 237 } 238 239 ThaiBreakEngine::~ThaiBreakEngine() { 240 delete fDictionary; 241 } 242 243 int32_t 244 ThaiBreakEngine::divideUpDictionaryRange( UText *text, 245 int32_t rangeStart, 246 int32_t rangeEnd, 247 UStack &foundBreaks ) const { 248 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 249 return 0; // Not enough characters for two words 250 } 251 252 uint32_t wordsFound = 0; 253 int32_t wordLength; 254 int32_t current; 255 UErrorCode status = U_ZERO_ERROR; 256 PossibleWord words[THAI_LOOKAHEAD]; 257 UChar32 uc; 258 259 utext_setNativeIndex(text, rangeStart); 260 261 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 262 wordLength = 0; 263 264 // Look for candidate words at the current position 265 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 266 267 // If we found exactly one, use that 268 if (candidates == 1) { 269 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 270 wordsFound += 1; 271 } 272 273 // If there was more than one, see which one can take us forward the most words 274 else if (candidates > 1) { 275 // If we're already at the end of the range, we're done 276 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 277 goto foundBest; 278 } 279 do { 280 int wordsMatched = 1; 281 if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 282 if (wordsMatched < 2) { 283 // Followed by another dictionary word; mark first word as a good candidate 284 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 285 wordsMatched = 2; 286 } 287 288 // If we're already at the end of the range, we're done 289 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 290 goto foundBest; 291 } 292 293 // See if any of the possible second words is followed by a third word 294 do { 295 // If we find a third word, stop right away 296 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 297 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 298 goto foundBest; 299 } 300 } 301 while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text)); 302 } 303 } 304 while (words[wordsFound%THAI_LOOKAHEAD].backUp(text)); 305 foundBest: 306 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 307 wordsFound += 1; 308 } 309 310 // We come here after having either found a word or not. We look ahead to the 311 // next word. If it's not a dictionary word, we will combine it withe the word we 312 // just found (if there is one), but only if the preceding word does not exceed 313 // the threshold. 314 // The text iterator should now be positioned at the end of the word we found. 315 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 316 // if it is a dictionary word, do nothing. If it isn't, then if there is 317 // no preceding word, or the non-word shares less than the minimum threshold 318 // of characters with a dictionary word, then scan to resynchronize 319 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 320 && (wordLength == 0 321 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 322 // Look for a plausible word boundary 323 //TODO: This section will need a rework for UText. 324 int32_t remaining = rangeEnd - (current+wordLength); 325 UChar32 pc = utext_current32(text); 326 int32_t chars = 0; 327 for (;;) { 328 utext_next32(text); 329 uc = utext_current32(text); 330 // TODO: Here we're counting on the fact that the SA languages are all 331 // in the BMP. This should get fixed with the UText rework. 332 chars += 1; 333 if (--remaining <= 0) { 334 break; 335 } 336 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 337 // Maybe. See if it's in the dictionary. 338 // NOTE: In the original Apple code, checked that the next 339 // two characters after uc were not 0x0E4C THANTHAKHAT before 340 // checking the dictionary. That is just a performance filter, 341 // but it's not clear it's faster than checking the trie. 342 int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 343 utext_setNativeIndex(text, current+wordLength+chars); 344 if (candidates > 0) { 345 break; 346 } 347 } 348 pc = uc; 349 } 350 351 // Bump the word count if there wasn't already one 352 if (wordLength <= 0) { 353 wordsFound += 1; 354 } 355 356 // Update the length with the passed-over characters 357 wordLength += chars; 358 } 359 else { 360 // Back up to where we were for next iteration 361 utext_setNativeIndex(text, current+wordLength); 362 } 363 } 364 365 // Never stop before a combining mark. 366 int32_t currPos; 367 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 368 utext_next32(text); 369 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 370 } 371 372 // Look ahead for possible suffixes if a dictionary word does not follow. 373 // We do this in code rather than using a rule so that the heuristic 374 // resynch continues to function. For example, one of the suffix characters 375 // could be a typo in the middle of a word. 376 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 377 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 378 && fSuffixSet.contains(uc = utext_current32(text))) { 379 if (uc == THAI_PAIYANNOI) { 380 if (!fSuffixSet.contains(utext_previous32(text))) { 381 // Skip over previous end and PAIYANNOI 382 utext_next32(text); 383 utext_next32(text); 384 wordLength += 1; // Add PAIYANNOI to word 385 uc = utext_current32(text); // Fetch next character 386 } 387 else { 388 // Restore prior position 389 utext_next32(text); 390 } 391 } 392 if (uc == THAI_MAIYAMOK) { 393 if (utext_previous32(text) != THAI_MAIYAMOK) { 394 // Skip over previous end and MAIYAMOK 395 utext_next32(text); 396 utext_next32(text); 397 wordLength += 1; // Add MAIYAMOK to word 398 } 399 else { 400 // Restore prior position 401 utext_next32(text); 402 } 403 } 404 } 405 else { 406 utext_setNativeIndex(text, current+wordLength); 407 } 408 } 409 410 // Did we find a word on this iteration? If so, push it on the break stack 411 if (wordLength > 0) { 412 foundBreaks.push((current+wordLength), status); 413 } 414 } 415 416 // Don't return a break for the end of the dictionary range if there is one there. 417 if (foundBreaks.peeki() >= rangeEnd) { 418 (void) foundBreaks.popi(); 419 wordsFound -= 1; 420 } 421 422 return wordsFound; 423 } 424 425 // How many words in a row are "good enough"? 426 #define KHMER_LOOKAHEAD 3 427 428 // Will not combine a non-word with a preceding dictionary word longer than this 429 #define KHMER_ROOT_COMBINE_THRESHOLD 3 430 431 // Will not combine a non-word that shares at least this much prefix with a 432 // dictionary word, with a preceding word 433 #define KHMER_PREFIX_COMBINE_THRESHOLD 3 434 435 // Minimum word size 436 #define KHMER_MIN_WORD 2 437 438 // Minimum number of characters for two words 439 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 440 441 KhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 442 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 443 fDictionary(adoptDictionary) 444 { 445 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 446 if (U_SUCCESS(status)) { 447 setCharacters(fKhmerWordSet); 448 } 449 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 450 fMarkSet.add(0x0020); 451 fEndWordSet = fKhmerWordSet; 452 fBeginWordSet.add(0x1780, 0x17B3); 453 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 454 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 455 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 456 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 457 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 458 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 459 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 460 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 461 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 462 // fSuffixSet.add(THAI_PAIYANNOI); 463 // fSuffixSet.add(THAI_MAIYAMOK); 464 465 // Compact for caching. 466 fMarkSet.compact(); 467 fEndWordSet.compact(); 468 fBeginWordSet.compact(); 469 // fSuffixSet.compact(); 470 } 471 472 KhmerBreakEngine::~KhmerBreakEngine() { 473 delete fDictionary; 474 } 475 476 int32_t 477 KhmerBreakEngine::divideUpDictionaryRange( UText *text, 478 int32_t rangeStart, 479 int32_t rangeEnd, 480 UStack &foundBreaks ) const { 481 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 482 return 0; // Not enough characters for two words 483 } 484 485 uint32_t wordsFound = 0; 486 int32_t wordLength; 487 int32_t current; 488 UErrorCode status = U_ZERO_ERROR; 489 PossibleWord words[KHMER_LOOKAHEAD]; 490 UChar32 uc; 491 492 utext_setNativeIndex(text, rangeStart); 493 494 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 495 wordLength = 0; 496 497 // Look for candidate words at the current position 498 int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 499 500 // If we found exactly one, use that 501 if (candidates == 1) { 502 wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 503 wordsFound += 1; 504 } 505 506 // If there was more than one, see which one can take us forward the most words 507 else if (candidates > 1) { 508 // If we're already at the end of the range, we're done 509 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 510 goto foundBest; 511 } 512 do { 513 int wordsMatched = 1; 514 if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 515 if (wordsMatched < 2) { 516 // Followed by another dictionary word; mark first word as a good candidate 517 words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 518 wordsMatched = 2; 519 } 520 521 // If we're already at the end of the range, we're done 522 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 523 goto foundBest; 524 } 525 526 // See if any of the possible second words is followed by a third word 527 do { 528 // If we find a third word, stop right away 529 if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 530 words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 531 goto foundBest; 532 } 533 } 534 while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text)); 535 } 536 } 537 while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text)); 538 foundBest: 539 wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 540 wordsFound += 1; 541 } 542 543 // We come here after having either found a word or not. We look ahead to the 544 // next word. If it's not a dictionary word, we will combine it with the word we 545 // just found (if there is one), but only if the preceding word does not exceed 546 // the threshold. 547 // The text iterator should now be positioned at the end of the word we found. 548 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 549 // if it is a dictionary word, do nothing. If it isn't, then if there is 550 // no preceding word, or the non-word shares less than the minimum threshold 551 // of characters with a dictionary word, then scan to resynchronize 552 if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 553 && (wordLength == 0 554 || words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 555 // Look for a plausible word boundary 556 //TODO: This section will need a rework for UText. 557 int32_t remaining = rangeEnd - (current+wordLength); 558 UChar32 pc = utext_current32(text); 559 int32_t chars = 0; 560 for (;;) { 561 utext_next32(text); 562 uc = utext_current32(text); 563 // TODO: Here we're counting on the fact that the SA languages are all 564 // in the BMP. This should get fixed with the UText rework. 565 chars += 1; 566 if (--remaining <= 0) { 567 break; 568 } 569 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 570 // Maybe. See if it's in the dictionary. 571 int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 572 utext_setNativeIndex(text, current+wordLength+chars); 573 if (candidates > 0) { 574 break; 575 } 576 } 577 pc = uc; 578 } 579 580 // Bump the word count if there wasn't already one 581 if (wordLength <= 0) { 582 wordsFound += 1; 583 } 584 585 // Update the length with the passed-over characters 586 wordLength += chars; 587 } 588 else { 589 // Back up to where we were for next iteration 590 utext_setNativeIndex(text, current+wordLength); 591 } 592 } 593 594 // Never stop before a combining mark. 595 int32_t currPos; 596 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 597 utext_next32(text); 598 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 599 } 600 601 // Look ahead for possible suffixes if a dictionary word does not follow. 602 // We do this in code rather than using a rule so that the heuristic 603 // resynch continues to function. For example, one of the suffix characters 604 // could be a typo in the middle of a word. 605 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 606 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 607 // && fSuffixSet.contains(uc = utext_current32(text))) { 608 // if (uc == KHMER_PAIYANNOI) { 609 // if (!fSuffixSet.contains(utext_previous32(text))) { 610 // // Skip over previous end and PAIYANNOI 611 // utext_next32(text); 612 // utext_next32(text); 613 // wordLength += 1; // Add PAIYANNOI to word 614 // uc = utext_current32(text); // Fetch next character 615 // } 616 // else { 617 // // Restore prior position 618 // utext_next32(text); 619 // } 620 // } 621 // if (uc == KHMER_MAIYAMOK) { 622 // if (utext_previous32(text) != KHMER_MAIYAMOK) { 623 // // Skip over previous end and MAIYAMOK 624 // utext_next32(text); 625 // utext_next32(text); 626 // wordLength += 1; // Add MAIYAMOK to word 627 // } 628 // else { 629 // // Restore prior position 630 // utext_next32(text); 631 // } 632 // } 633 // } 634 // else { 635 // utext_setNativeIndex(text, current+wordLength); 636 // } 637 // } 638 639 // Did we find a word on this iteration? If so, push it on the break stack 640 if (wordLength > 0) { 641 foundBreaks.push((current+wordLength), status); 642 } 643 } 644 645 // Don't return a break for the end of the dictionary range if there is one there. 646 if (foundBreaks.peeki() >= rangeEnd) { 647 (void) foundBreaks.popi(); 648 wordsFound -= 1; 649 } 650 651 return wordsFound; 652 } 653 654 U_NAMESPACE_END 655 656 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 657