/**
 *******************************************************************************
 * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                        *
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 * DictionaryBreakEngine
 ******************************************************************
 */

/*DictionaryBreakEngine::DictionaryBreakEngine() {
    fTypes = 0;
}*/

/**
 * Constructor.
 *
 * @param breakTypes A bit mask with bit (1 << type) set for each
 *        UBreakIteratorType this engine should handle; consulted by
 *        handles() and findBreaks().
 */
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

/**
 * Returns TRUE if this engine handles character c for the given break type:
 * the type's bit must be set in fTypes and c must be in the engine's
 * character set (installed via setCharacters()).
 */
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}

/**
 * Find breaks within the run of dictionary characters around the current
 * text position.
 *
 * First delimits the maximal contiguous span of characters from fSet that
 * contains the current position (scanning backward when reverse is TRUE,
 * forward otherwise), then hands that span to the subclass's
 * divideUpDictionaryRange().
 *
 * @param text        Text iterator; left positioned at the edge of the
 *                    scanned span on return.
 * @param startPos    Lower bound for the backward scan.
 * @param endPos      Upper bound for the forward scan.
 * @param reverse     Direction of the initial span scan.
 * @param breakType   The break type being computed; must be enabled in
 *                    fTypes for any work to happen.
 * @param foundBreaks Stack onto which break positions are pushed.
 * @return Number of breaks found, or 0 if the type is not handled.
 */
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Walk backwards while characters stay inside the dictionary set,
        // stopping at startPos or at the first non-set character.
        UBool isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the scan stopped on a non-set character, the range begins one
        // position after it; clamp to startPos otherwise.
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forwards over dictionary-set characters up to endPos.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only do dictionary work if this break type is enabled for the engine.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        // Restore the iterator to the end of the span scan.
        utext_setNativeIndex(text, current);
    }

    return result;
}

/**
 * Install the set of characters this engine handles. The set is copied and
 * compacted so the cached copy stays small.
 */
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*void
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
    fTypes = breakTypes;
}*/

/*
 ******************************************************************
 */


// Helper class for improving readability of the Thai word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
108 #define POSSIBLE_WORD_LIST_MAX 20 109 110 class PossibleWord { 111 private: 112 // list of word candidate lengths, in increasing length order 113 int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 114 int count; // Count of candidates 115 int32_t prefix; // The longest match with a dictionary word 116 int32_t offset; // Offset in the text of these candidates 117 int mark; // The preferred candidate's offset 118 int current; // The candidate we're currently looking at 119 120 public: 121 PossibleWord(); 122 ~PossibleWord(); 123 124 // Fill the list of candidates if needed, select the longest, and return the number found 125 int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); 126 127 // Select the currently marked candidate, point after it in the text, and invalidate self 128 int32_t acceptMarked( UText *text ); 129 130 // Back up from the current candidate to the next shorter one; return TRUE if that exists 131 // and point the text after it 132 UBool backUp( UText *text ); 133 134 // Return the longest prefix this candidate location shares with a dictionary word 135 int32_t longestPrefix(); 136 137 // Mark the current candidate as the one we like 138 void markCurrent(); 139 }; 140 141 inline 142 PossibleWord::PossibleWord() { 143 offset = -1; 144 } 145 146 inline 147 PossibleWord::~PossibleWord() { 148 } 149 150 inline int 151 PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { 152 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 153 int32_t start = (int32_t)utext_getNativeIndex(text); 154 if (start != offset) { 155 offset = start; 156 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 157 // Dictionary leaves text after longest prefix, not longest word. Back up. 
158 if (count <= 0) { 159 utext_setNativeIndex(text, start); 160 } 161 } 162 if (count > 0) { 163 utext_setNativeIndex(text, start+lengths[count-1]); 164 } 165 current = count-1; 166 mark = current; 167 return count; 168 } 169 170 inline int32_t 171 PossibleWord::acceptMarked( UText *text ) { 172 utext_setNativeIndex(text, offset + lengths[mark]); 173 return lengths[mark]; 174 } 175 176 inline UBool 177 PossibleWord::backUp( UText *text ) { 178 if (current > 0) { 179 utext_setNativeIndex(text, offset + lengths[--current]); 180 return TRUE; 181 } 182 return FALSE; 183 } 184 185 inline int32_t 186 PossibleWord::longestPrefix() { 187 return prefix; 188 } 189 190 inline void 191 PossibleWord::markCurrent() { 192 mark = current; 193 } 194 195 // How many words in a row are "good enough"? 196 #define THAI_LOOKAHEAD 3 197 198 // Will not combine a non-word with a preceding dictionary word longer than this 199 #define THAI_ROOT_COMBINE_THRESHOLD 3 200 201 // Will not combine a non-word that shares at least this much prefix with a 202 // dictionary word, with a preceding word 203 #define THAI_PREFIX_COMBINE_THRESHOLD 3 204 205 // Ellision character 206 #define THAI_PAIYANNOI 0x0E2F 207 208 // Repeat character 209 #define THAI_MAIYAMOK 0x0E46 210 211 // Minimum word size 212 #define THAI_MIN_WORD 2 213 214 // Minimum number of characters for two words 215 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 216 217 ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 218 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 219 fDictionary(adoptDictionary) 220 { 221 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 222 if (U_SUCCESS(status)) { 223 setCharacters(fThaiWordSet); 224 } 225 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 226 fMarkSet.add(0x0020); 227 fEndWordSet = fThaiWordSet; 228 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 229 
    fEndWordSet.remove(0x0E40, 0x0E44);   // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);    // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

// Destructor: releases the adopted dictionary.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}

/**
 * Divide up a range of known Thai dictionary characters.
 *
 * Greedy longest-match with up to THAI_LOOKAHEAD words of lookahead:
 * at each position the candidate that lets the most following words match
 * is preferred. Non-dictionary stretches are resynchronized by scanning
 * for a plausible end-word/begin-word boundary, and special handling is
 * applied to combining marks and the PAIYANNOI/MAIYAMOK suffixes.
 *
 * @param text        Text iterator over the dictionary range.
 * @param rangeStart  Start of the range of dictionary characters.
 * @param rangeEnd    End (exclusive) of the range.
 * @param foundBreaks Stack onto which break positions are pushed.
 * @return Number of words found.
 */
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed modulo THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            // Try each candidate from longest to shortest, looking for one
            // that is followed by further dictionary words.
            do {
                int wordsMatched = 1;
                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A boundary is plausible where a valid word-ending character
                    // is followed by a valid word-beginning character.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
420 if (foundBreaks.peeki() >= rangeEnd) { 421 (void) foundBreaks.popi(); 422 wordsFound -= 1; 423 } 424 425 return wordsFound; 426 } 427 428 /* 429 ****************************************************************** 430 * CjkBreakEngine 431 */ 432 static const uint32_t kuint32max = 0xFFFFFFFF; 433 CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status) 434 : DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ 435 if (!adoptDictionary->getValued()) { 436 status = U_ILLEGAL_ARGUMENT_ERROR; 437 return; 438 } 439 440 // Korean dictionary only includes Hangul syllables 441 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 442 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 443 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 444 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 445 446 if (U_SUCCESS(status)) { 447 // handle Korean and Japanese/Chinese using different dictionaries 448 if (type == kKorean) { 449 setCharacters(fHangulWordSet); 450 } else { //Chinese and Japanese 451 UnicodeSet cjSet; 452 cjSet.addAll(fHanWordSet); 453 cjSet.addAll(fKatakanaWordSet); 454 cjSet.addAll(fHiraganaWordSet); 455 cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); 456 setCharacters(cjSet); 457 } 458 } 459 } 460 461 CjkBreakEngine::~CjkBreakEngine(){ 462 delete fDictionary; 463 } 464 465 // The katakanaCost values below are based on the length frequencies of all 466 // katakana phrases in the dictionary 467 static const int kMaxKatakanaLength = 8; 468 static const int kMaxKatakanaGroupLength = 20; 469 static const uint32_t maxSnlp = 255; 470 471 static inline uint32_t getKatakanaCost(int wordLength){ 472 //TODO: fill array with actual values from dictionary! 
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

// True if value is a katakana code unit (fullwidth block excluding the middle
// dot, or the halfwidth forms). NOTE(review): the parameter is uint16_t but
// callers pass UChar32 from utext_current32(); supplementary-plane values
// would be truncated — presumably acceptable since katakana is in the BMP.
static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9fu);
}

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange. Holds N elements on the stack and falls back to
// heap allocation (uprv_malloc, freed in the destructor) for larger sizes.
// NOTE: elements are NOT constructed or copied; suitable for POD types only.
template<class T, size_t N>
class AutoBuffer {
 public:
  // Allocate room for at least 'size' elements.
  AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    if (size > N) {
      buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
      capacity = size;
    }
  }
  ~AutoBuffer() {
    if (buffer != stackBuffer)
      uprv_free(buffer);
  }
#if 0
  T* operator& () {
    return buffer;
  }
#endif
  // Raw pointer to the element storage.
  T* elems() {
    return buffer;
  }
  const T& operator[] (size_t i) const {
    return buffer[i];
  }
  T& operator[] (size_t i) {
    return buffer[i];
  }

  // resize without copy: existing contents are discarded when growing.
  void resize(size_t size) {
    if (size <= capacity)
      return;
    if (buffer != stackBuffer)
      uprv_free(buffer);
    buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    capacity = size;
  }
 private:
  T stackBuffer[N];
  T* buffer;
  AutoBuffer();         // default construction disallowed
  size_t capacity;
};


/*
 * Divide up a range of CJK dictionary characters with a dynamic-programming
 * least-cost (SNLP) segmentation over the NFKC-normalized text, plus a
 * heuristic for katakana runs.
 *
 * @param   text A UText representing the text
 * @param   rangeStart The start of the range of dictionary characters
 * @param   rangeEnd The end of the range of dictionary characters
 * @param   foundBreaks Output of C array of int32_t break positions, or 0
 * @return  The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status))
        return 0;

    UnicodeString inputString(charString.elems(), inputLength);
    UNormalizationMode norm_mode = UNORM_NFKC;
    // quickCheck is cheap; fall back to the definitive isNormalized only when
    // the quick check is inconclusive.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = raw-input index corresponding to normalized index i.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: the index map is just code-point boundaries.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status))
            return 0;
        charPositions.resize(normalizedString.length() + 1);
        // Walk an incremental Normalizer over the raw text to recover, for each
        // normalized code point, the raw-input index it came from.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            // next() advances the normalizer; its return value is unused here.
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status))
        return 0;

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i=1; i<=numChars; i++){
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i=0; i<=numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions not reachable by any segmentation found so far.
        if (bestSnlp[i] == kuint32max)
            continue;

        int count;
        // limit maximum word length matched to size of current substring
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))){
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax: try extending the best segmentation ending at i with each
        // candidate word starting at i.
        for (int j = 0; j < count; j++){
            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs at the group-length cap get no heuristic candidate.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] back-pointers from the end to recover boundaries.
        for (int i = numChars; i > 0; i = prev[i]){
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */