1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "brkeng.h" 15 #include "dictbe.h" 16 #include "unicode/uniset.h" 17 #include "unicode/chariter.h" 18 #include "unicode/ubrk.h" 19 #include "uvectr32.h" 20 #include "uvector.h" 21 #include "uassert.h" 22 #include "unicode/normlzr.h" 23 #include "cmemory.h" 24 #include "dictionarydata.h" 25 26 U_NAMESPACE_BEGIN 27 28 /* 29 ****************************************************************** 30 */ 31 32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 33 fTypes = breakTypes; 34 } 35 36 DictionaryBreakEngine::~DictionaryBreakEngine() { 37 } 38 39 UBool 40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 41 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 42 && fSet.contains(c)); 43 } 44 45 int32_t 46 DictionaryBreakEngine::findBreaks( UText *text, 47 int32_t startPos, 48 int32_t endPos, 49 int32_t breakType, 50 UVector32 &foundBreaks ) const { 51 (void)startPos; // TODO: remove this param? 52 int32_t result = 0; 53 54 // Find the span of characters included in the set. 55 // The span to break begins at the current position in the text, and 56 // extends towards the start or end of the text, depending on 'reverse'. 57 58 int32_t start = (int32_t)utext_getNativeIndex(text); 59 int32_t current; 60 int32_t rangeStart; 61 int32_t rangeEnd; 62 UChar32 c = utext_current32(text); 63 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 64 utext_next32(text); // TODO: recast loop for postincrement 65 c = utext_current32(text); 66 } 67 rangeStart = start; 68 rangeEnd = current; 69 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 70 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 71 utext_setNativeIndex(text, current); 72 } 73 74 return result; 75 } 76 77 void 78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 79 fSet = set; 80 // Compact for caching 81 fSet.compact(); 82 } 83 84 /* 85 ****************************************************************** 86 * PossibleWord 87 */ 88 89 // Helper class for improving readability of the Thai/Lao/Khmer word break 90 // algorithm. The implementation is completely inline. 91 92 // List size, limited by the maximum number of words in the dictionary 93 // that form a nested sequence. 94 static const int32_t POSSIBLE_WORD_LIST_MAX = 20; 95 96 class PossibleWord { 97 private: 98 // list of word candidate lengths, in increasing length order 99 // TODO: bytes would be sufficient for word lengths. 100 int32_t count; // Count of candidates 101 int32_t prefix; // The longest match with a dictionary word 102 int32_t offset; // Offset in the text of these candidates 103 int32_t mark; // The preferred candidate's offset 104 int32_t current; // The candidate we're currently looking at 105 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units. 106 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points. 107 108 public: 109 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}; 110 ~PossibleWord() {}; 111 112 // Fill the list of candidates if needed, select the longest, and return the number found 113 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 114 115 // Select the currently marked candidate, point after it in the text, and invalidate self 116 int32_t acceptMarked( UText *text ); 117 118 // Back up from the current candidate to the next shorter one; return TRUE if that exists 119 // and point the text after it 120 UBool backUp( UText *text ); 121 122 // Return the longest prefix this candidate location shares with a dictionary word 123 // Return value is in code points. 124 int32_t longestPrefix() { return prefix; }; 125 126 // Mark the current candidate as the one we like 127 void markCurrent() { mark = current; }; 128 129 // Get length in code points of the marked word. 130 int32_t markedCPLength() { return cpLengths[mark]; }; 131 }; 132 133 134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 135 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 136 int32_t start = (int32_t)utext_getNativeIndex(text); 137 if (start != offset) { 138 offset = start; 139 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); 140 // Dictionary leaves text after longest prefix, not longest word. Back up. 141 if (count <= 0) { 142 utext_setNativeIndex(text, start); 143 } 144 } 145 if (count > 0) { 146 utext_setNativeIndex(text, start+cuLengths[count-1]); 147 } 148 current = count-1; 149 mark = current; 150 return count; 151 } 152 153 int32_t 154 PossibleWord::acceptMarked( UText *text ) { 155 utext_setNativeIndex(text, offset + cuLengths[mark]); 156 return cuLengths[mark]; 157 } 158 159 160 UBool 161 PossibleWord::backUp( UText *text ) { 162 if (current > 0) { 163 utext_setNativeIndex(text, offset + cuLengths[--current]); 164 return TRUE; 165 } 166 return FALSE; 167 } 168 169 /* 170 ****************************************************************** 171 * ThaiBreakEngine 172 */ 173 174 // How many words in a row are "good enough"? 175 static const int32_t THAI_LOOKAHEAD = 3; 176 177 // Will not combine a non-word with a preceding dictionary word longer than this 178 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3; 179 180 // Will not combine a non-word that shares at least this much prefix with a 181 // dictionary word, with a preceding word 182 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3; 183 184 // Ellision character 185 static const int32_t THAI_PAIYANNOI = 0x0E2F; 186 187 // Repeat character 188 static const int32_t THAI_MAIYAMOK = 0x0E46; 189 190 // Minimum word size 191 static const int32_t THAI_MIN_WORD = 2; 192 193 // Minimum number of characters for two words 194 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; 195 196 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 197 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 198 fDictionary(adoptDictionary) 199 { 200 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 201 if (U_SUCCESS(status)) { 202 setCharacters(fThaiWordSet); 203 } 204 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 205 fMarkSet.add(0x0020); 206 fEndWordSet = fThaiWordSet; 207 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 208 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 209 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 210 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 211 fSuffixSet.add(THAI_PAIYANNOI); 212 fSuffixSet.add(THAI_MAIYAMOK); 213 214 // Compact for caching. 215 fMarkSet.compact(); 216 fEndWordSet.compact(); 217 fBeginWordSet.compact(); 218 fSuffixSet.compact(); 219 } 220 221 ThaiBreakEngine::~ThaiBreakEngine() { 222 delete fDictionary; 223 } 224 225 int32_t 226 ThaiBreakEngine::divideUpDictionaryRange( UText *text, 227 int32_t rangeStart, 228 int32_t rangeEnd, 229 UVector32 &foundBreaks ) const { 230 utext_setNativeIndex(text, rangeStart); 231 utext_moveIndex32(text, THAI_MIN_WORD_SPAN); 232 if (utext_getNativeIndex(text) >= rangeEnd) { 233 return 0; // Not enough characters for two words 234 } 235 utext_setNativeIndex(text, rangeStart); 236 237 238 uint32_t wordsFound = 0; 239 int32_t cpWordLength = 0; // Word Length in Code Points. 240 int32_t cuWordLength = 0; // Word length in code units (UText native indexing) 241 int32_t current; 242 UErrorCode status = U_ZERO_ERROR; 243 PossibleWord words[THAI_LOOKAHEAD]; 244 245 utext_setNativeIndex(text, rangeStart); 246 247 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 248 cpWordLength = 0; 249 cuWordLength = 0; 250 251 // Look for candidate words at the current position 252 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 253 254 // If we found exactly one, use that 255 if (candidates == 1) { 256 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 257 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 258 wordsFound += 1; 259 } 260 // If there was more than one, see which one can take us forward the most words 261 else if (candidates > 1) { 262 // If we're already at the end of the range, we're done 263 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 264 goto foundBest; 265 } 266 do { 267 int32_t wordsMatched = 1; 268 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 269 if (wordsMatched < 2) { 270 // Followed by another dictionary word; mark first word as a good candidate 271 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 272 wordsMatched = 2; 273 } 274 275 // If we're already at the end of the range, we're done 276 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 277 goto foundBest; 278 } 279 280 // See if any of the possible second words is followed by a third word 281 do { 282 // If we find a third word, stop right away 283 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 284 words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 285 goto foundBest; 286 } 287 } 288 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 289 } 290 } 291 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 292 foundBest: 293 // Set UText position to after the accepted word. 294 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 295 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 296 wordsFound += 1; 297 } 298 299 // We come here after having either found a word or not. We look ahead to the 300 // next word. If it's not a dictionary word, we will combine it with the word we 301 // just found (if there is one), but only if the preceding word does not exceed 302 // the threshold. 303 // The text iterator should now be positioned at the end of the word we found. 304 305 UChar32 uc = 0; 306 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) { 307 // if it is a dictionary word, do nothing. If it isn't, then if there is 308 // no preceding word, or the non-word shares less than the minimum threshold 309 // of characters with a dictionary word, then scan to resynchronize 310 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 311 && (cuWordLength == 0 312 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 313 // Look for a plausible word boundary 314 int32_t remaining = rangeEnd - (current+cuWordLength); 315 UChar32 pc; 316 int32_t chars = 0; 317 for (;;) { 318 int32_t pcIndex = (int32_t)utext_getNativeIndex(text); 319 pc = utext_next32(text); 320 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; 321 chars += pcSize; 322 remaining -= pcSize; 323 if (remaining <= 0) { 324 break; 325 } 326 uc = utext_current32(text); 327 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 328 // Maybe. See if it's in the dictionary. 329 // NOTE: In the original Apple code, checked that the next 330 // two characters after uc were not 0x0E4C THANTHAKHAT before 331 // checking the dictionary. That is just a performance filter, 332 // but it's not clear it's faster than checking the trie. 333 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 334 utext_setNativeIndex(text, current + cuWordLength + chars); 335 if (candidates > 0) { 336 break; 337 } 338 } 339 } 340 341 // Bump the word count if there wasn't already one 342 if (cuWordLength <= 0) { 343 wordsFound += 1; 344 } 345 346 // Update the length with the passed-over characters 347 cuWordLength += chars; 348 } 349 else { 350 // Back up to where we were for next iteration 351 utext_setNativeIndex(text, current+cuWordLength); 352 } 353 } 354 355 // Never stop before a combining mark. 356 int32_t currPos; 357 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 358 utext_next32(text); 359 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 360 } 361 362 // Look ahead for possible suffixes if a dictionary word does not follow. 363 // We do this in code rather than using a rule so that the heuristic 364 // resynch continues to function. For example, one of the suffix characters 365 // could be a typo in the middle of a word. 366 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) { 367 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 368 && fSuffixSet.contains(uc = utext_current32(text))) { 369 if (uc == THAI_PAIYANNOI) { 370 if (!fSuffixSet.contains(utext_previous32(text))) { 371 // Skip over previous end and PAIYANNOI 372 utext_next32(text); 373 int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text); 374 utext_next32(text); 375 cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word 376 uc = utext_current32(text); // Fetch next character 377 } 378 else { 379 // Restore prior position 380 utext_next32(text); 381 } 382 } 383 if (uc == THAI_MAIYAMOK) { 384 if (utext_previous32(text) != THAI_MAIYAMOK) { 385 // Skip over previous end and MAIYAMOK 386 utext_next32(text); 387 int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text); 388 utext_next32(text); 389 cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word 390 } 391 else { 392 // Restore prior position 393 utext_next32(text); 394 } 395 } 396 } 397 else { 398 utext_setNativeIndex(text, current+cuWordLength); 399 } 400 } 401 402 // Did we find a word on this iteration? If so, push it on the break stack 403 if (cuWordLength > 0) { 404 foundBreaks.push((current+cuWordLength), status); 405 } 406 } 407 408 // Don't return a break for the end of the dictionary range if there is one there. 409 if (foundBreaks.peeki() >= rangeEnd) { 410 (void) foundBreaks.popi(); 411 wordsFound -= 1; 412 } 413 414 return wordsFound; 415 } 416 417 /* 418 ****************************************************************** 419 * LaoBreakEngine 420 */ 421 422 // How many words in a row are "good enough"? 423 static const int32_t LAO_LOOKAHEAD = 3; 424 425 // Will not combine a non-word with a preceding dictionary word longer than this 426 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3; 427 428 // Will not combine a non-word that shares at least this much prefix with a 429 // dictionary word, with a preceding word 430 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3; 431 432 // Minimum word size 433 static const int32_t LAO_MIN_WORD = 2; 434 435 // Minimum number of characters for two words 436 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2; 437 438 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 439 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 440 fDictionary(adoptDictionary) 441 { 442 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); 443 if (U_SUCCESS(status)) { 444 setCharacters(fLaoWordSet); 445 } 446 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 447 fMarkSet.add(0x0020); 448 fEndWordSet = fLaoWordSet; 449 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 450 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 451 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 452 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 453 454 // Compact for caching. 455 fMarkSet.compact(); 456 fEndWordSet.compact(); 457 fBeginWordSet.compact(); 458 } 459 460 LaoBreakEngine::~LaoBreakEngine() { 461 delete fDictionary; 462 } 463 464 int32_t 465 LaoBreakEngine::divideUpDictionaryRange( UText *text, 466 int32_t rangeStart, 467 int32_t rangeEnd, 468 UVector32 &foundBreaks ) const { 469 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 470 return 0; // Not enough characters for two words 471 } 472 473 uint32_t wordsFound = 0; 474 int32_t cpWordLength = 0; 475 int32_t cuWordLength = 0; 476 int32_t current; 477 UErrorCode status = U_ZERO_ERROR; 478 PossibleWord words[LAO_LOOKAHEAD]; 479 480 utext_setNativeIndex(text, rangeStart); 481 482 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 483 cuWordLength = 0; 484 cpWordLength = 0; 485 486 // Look for candidate words at the current position 487 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 488 489 // If we found exactly one, use that 490 if (candidates == 1) { 491 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 492 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 493 wordsFound += 1; 494 } 495 // If there was more than one, see which one can take us forward the most words 496 else if (candidates > 1) { 497 // If we're already at the end of the range, we're done 498 if (utext_getNativeIndex(text) >= rangeEnd) { 499 goto foundBest; 500 } 501 do { 502 int32_t wordsMatched = 1; 503 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 504 if (wordsMatched < 2) { 505 // Followed by another dictionary word; mark first word as a good candidate 506 words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 507 wordsMatched = 2; 508 } 509 510 // If we're already at the end of the range, we're done 511 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 512 goto foundBest; 513 } 514 515 // See if any of the possible second words is followed by a third word 516 do { 517 // If we find a third word, stop right away 518 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 519 words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 520 goto foundBest; 521 } 522 } 523 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 524 } 525 } 526 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 527 foundBest: 528 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 529 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 530 wordsFound += 1; 531 } 532 533 // We come here after having either found a word or not. We look ahead to the 534 // next word. If it's not a dictionary word, we will combine it withe the word we 535 // just found (if there is one), but only if the preceding word does not exceed 536 // the threshold. 537 // The text iterator should now be positioned at the end of the word we found. 538 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) { 539 // if it is a dictionary word, do nothing. If it isn't, then if there is 540 // no preceding word, or the non-word shares less than the minimum threshold 541 // of characters with a dictionary word, then scan to resynchronize 542 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 543 && (cuWordLength == 0 544 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 545 // Look for a plausible word boundary 546 int32_t remaining = rangeEnd - (current + cuWordLength); 547 UChar32 pc; 548 UChar32 uc; 549 int32_t chars = 0; 550 for (;;) { 551 int32_t pcIndex = (int32_t)utext_getNativeIndex(text); 552 pc = utext_next32(text); 553 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; 554 chars += pcSize; 555 remaining -= pcSize; 556 if (remaining <= 0) { 557 break; 558 } 559 uc = utext_current32(text); 560 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 561 // Maybe. See if it's in the dictionary. 562 // TODO: this looks iffy; compare with old code. 563 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 564 utext_setNativeIndex(text, current + cuWordLength + chars); 565 if (candidates > 0) { 566 break; 567 } 568 } 569 } 570 571 // Bump the word count if there wasn't already one 572 if (cuWordLength <= 0) { 573 wordsFound += 1; 574 } 575 576 // Update the length with the passed-over characters 577 cuWordLength += chars; 578 } 579 else { 580 // Back up to where we were for next iteration 581 utext_setNativeIndex(text, current + cuWordLength); 582 } 583 } 584 585 // Never stop before a combining mark. 586 int32_t currPos; 587 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 588 utext_next32(text); 589 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 590 } 591 592 // Look ahead for possible suffixes if a dictionary word does not follow. 593 // We do this in code rather than using a rule so that the heuristic 594 // resynch continues to function. For example, one of the suffix characters 595 // could be a typo in the middle of a word. 596 // NOT CURRENTLY APPLICABLE TO LAO 597 598 // Did we find a word on this iteration? If so, push it on the break stack 599 if (cuWordLength > 0) { 600 foundBreaks.push((current+cuWordLength), status); 601 } 602 } 603 604 // Don't return a break for the end of the dictionary range if there is one there. 605 if (foundBreaks.peeki() >= rangeEnd) { 606 (void) foundBreaks.popi(); 607 wordsFound -= 1; 608 } 609 610 return wordsFound; 611 } 612 613 /* 614 ****************************************************************** 615 * BurmeseBreakEngine 616 */ 617 618 // How many words in a row are "good enough"? 619 static const int32_t BURMESE_LOOKAHEAD = 3; 620 621 // Will not combine a non-word with a preceding dictionary word longer than this 622 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3; 623 624 // Will not combine a non-word that shares at least this much prefix with a 625 // dictionary word, with a preceding word 626 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3; 627 628 // Minimum word size 629 static const int32_t BURMESE_MIN_WORD = 2; 630 631 // Minimum number of characters for two words 632 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2; 633 634 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 635 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 636 fDictionary(adoptDictionary) 637 { 638 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); 639 if (U_SUCCESS(status)) { 640 setCharacters(fBurmeseWordSet); 641 } 642 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); 643 fMarkSet.add(0x0020); 644 fEndWordSet = fBurmeseWordSet; 645 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels 646 647 // Compact for caching. 648 fMarkSet.compact(); 649 fEndWordSet.compact(); 650 fBeginWordSet.compact(); 651 } 652 653 BurmeseBreakEngine::~BurmeseBreakEngine() { 654 delete fDictionary; 655 } 656 657 int32_t 658 BurmeseBreakEngine::divideUpDictionaryRange( UText *text, 659 int32_t rangeStart, 660 int32_t rangeEnd, 661 UVector32 &foundBreaks ) const { 662 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { 663 return 0; // Not enough characters for two words 664 } 665 666 uint32_t wordsFound = 0; 667 int32_t cpWordLength = 0; 668 int32_t cuWordLength = 0; 669 int32_t current; 670 UErrorCode status = U_ZERO_ERROR; 671 PossibleWord words[BURMESE_LOOKAHEAD]; 672 673 utext_setNativeIndex(text, rangeStart); 674 675 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 676 cuWordLength = 0; 677 cpWordLength = 0; 678 679 // Look for candidate words at the current position 680 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 681 682 // If we found exactly one, use that 683 if (candidates == 1) { 684 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 685 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 686 wordsFound += 1; 687 } 688 // If there was more than one, see which one can take us forward the most words 689 else if (candidates > 1) { 690 // If we're already at the end of the range, we're done 691 if (utext_getNativeIndex(text) >= rangeEnd) { 692 goto foundBest; 693 } 694 do { 695 int32_t wordsMatched = 1; 696 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 697 if (wordsMatched < 2) { 698 // Followed by another dictionary word; mark first word as a good candidate 699 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); 700 wordsMatched = 2; 701 } 702 703 // If we're already at the end of the range, we're done 704 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 705 goto foundBest; 706 } 707 708 // See if any of the possible second words is followed by a third word 709 do { 710 // If we find a third word, stop right away 711 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 712 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent(); 713 goto foundBest; 714 } 715 } 716 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text)); 717 } 718 } 719 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text)); 720 foundBest: 721 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 722 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 723 wordsFound += 1; 724 } 725 726 // We come here after having either found a word or not. We look ahead to the 727 // next word. If it's not a dictionary word, we will combine it withe the word we 728 // just found (if there is one), but only if the preceding word does not exceed 729 // the threshold. 730 // The text iterator should now be positioned at the end of the word we found. 731 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) { 732 // if it is a dictionary word, do nothing. If it isn't, then if there is 733 // no preceding word, or the non-word shares less than the minimum threshold 734 // of characters with a dictionary word, then scan to resynchronize 735 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 736 && (cuWordLength == 0 737 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) { 738 // Look for a plausible word boundary 739 int32_t remaining = rangeEnd - (current + cuWordLength); 740 UChar32 pc; 741 UChar32 uc; 742 int32_t chars = 0; 743 for (;;) { 744 int32_t pcIndex = (int32_t)utext_getNativeIndex(text); 745 pc = utext_next32(text); 746 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; 747 chars += pcSize; 748 remaining -= pcSize; 749 if (remaining <= 0) { 750 break; 751 } 752 uc = utext_current32(text); 753 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 754 // Maybe. See if it's in the dictionary. 755 // TODO: this looks iffy; compare with old code. 756 int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 757 utext_setNativeIndex(text, current + cuWordLength + chars); 758 if (candidates > 0) { 759 break; 760 } 761 } 762 } 763 764 // Bump the word count if there wasn't already one 765 if (cuWordLength <= 0) { 766 wordsFound += 1; 767 } 768 769 // Update the length with the passed-over characters 770 cuWordLength += chars; 771 } 772 else { 773 // Back up to where we were for next iteration 774 utext_setNativeIndex(text, current + cuWordLength); 775 } 776 } 777 778 // Never stop before a combining mark. 779 int32_t currPos; 780 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 781 utext_next32(text); 782 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 783 } 784 785 // Look ahead for possible suffixes if a dictionary word does not follow. 786 // We do this in code rather than using a rule so that the heuristic 787 // resynch continues to function. For example, one of the suffix characters 788 // could be a typo in the middle of a word. 789 // NOT CURRENTLY APPLICABLE TO BURMESE 790 791 // Did we find a word on this iteration? If so, push it on the break stack 792 if (cuWordLength > 0) { 793 foundBreaks.push((current+cuWordLength), status); 794 } 795 } 796 797 // Don't return a break for the end of the dictionary range if there is one there. 798 if (foundBreaks.peeki() >= rangeEnd) { 799 (void) foundBreaks.popi(); 800 wordsFound -= 1; 801 } 802 803 return wordsFound; 804 } 805 806 /* 807 ****************************************************************** 808 * KhmerBreakEngine 809 */ 810 811 // How many words in a row are "good enough"? 812 static const int32_t KHMER_LOOKAHEAD = 3; 813 814 // Will not combine a non-word with a preceding dictionary word longer than this 815 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; 816 817 // Will not combine a non-word that shares at least this much prefix with a 818 // dictionary word, with a preceding word 819 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; 820 821 // Minimum word size 822 static const int32_t KHMER_MIN_WORD = 2; 823 824 // Minimum number of characters for two words 825 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; 826 827 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 828 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), 829 fDictionary(adoptDictionary) 830 { 831 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 832 if (U_SUCCESS(status)) { 833 setCharacters(fKhmerWordSet); 834 } 835 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 836 fMarkSet.add(0x0020); 837 fEndWordSet = fKhmerWordSet; 838 fBeginWordSet.add(0x1780, 0x17B3); 839 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 840 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 841 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 842 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 843 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 844 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 845 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 846 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 847 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 848 // fSuffixSet.add(THAI_PAIYANNOI); 849 // fSuffixSet.add(THAI_MAIYAMOK); 850 851 // Compact for caching. 852 fMarkSet.compact(); 853 fEndWordSet.compact(); 854 fBeginWordSet.compact(); 855 // fSuffixSet.compact(); 856 } 857 858 KhmerBreakEngine::~KhmerBreakEngine() { 859 delete fDictionary; 860 } 861 862 int32_t 863 KhmerBreakEngine::divideUpDictionaryRange( UText *text, 864 int32_t rangeStart, 865 int32_t rangeEnd, 866 UVector32 &foundBreaks ) const { 867 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 868 return 0; // Not enough characters for two words 869 } 870 871 uint32_t wordsFound = 0; 872 int32_t cpWordLength = 0; 873 int32_t cuWordLength = 0; 874 int32_t current; 875 UErrorCode status = U_ZERO_ERROR; 876 PossibleWord words[KHMER_LOOKAHEAD]; 877 878 utext_setNativeIndex(text, rangeStart); 879 880 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 881 cuWordLength = 0; 882 cpWordLength = 0; 883 884 // Look for candidate words at the current position 885 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 886 887 // If we found exactly one, use that 888 if (candidates == 1) { 889 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 890 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); 891 wordsFound += 1; 892 } 893 894 // If there was more than one, see which one can take us forward the most words 895 else if (candidates > 1) { 896 // If we're already at the end of the range, we're done 897 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 898 goto foundBest; 899 } 900 do { 901 int32_t wordsMatched = 1; 902 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 903 if (wordsMatched < 2) { 904 // Followed by another dictionary word; mark first word as a good candidate 905 words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 906 wordsMatched = 2; 907 } 908 909 // If we're already at the end of the range, we're done 910 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 911 goto foundBest; 912 } 913 914 // See if any of the possible second words is followed by a third word 915 do { 916 // If we find a third word, stop right away 917 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 918 words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 919 goto foundBest; 920 } 921 } 922 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); 923 } 924 } 925 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); 926 foundBest: 927 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 928 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); 929 wordsFound += 1; 930 } 931 932 // We come here after having either found a word or not. We look ahead to the 933 // next word. If it's not a dictionary word, we will combine it with the word we 934 // just found (if there is one), but only if the preceding word does not exceed 935 // the threshold. 936 // The text iterator should now be positioned at the end of the word we found. 937 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 938 // if it is a dictionary word, do nothing. If it isn't, then if there is 939 // no preceding word, or the non-word shares less than the minimum threshold 940 // of characters with a dictionary word, then scan to resynchronize 941 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 942 && (cuWordLength == 0 943 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 944 // Look for a plausible word boundary 945 int32_t remaining = rangeEnd - (current+cuWordLength); 946 UChar32 pc; 947 UChar32 uc; 948 int32_t chars = 0; 949 for (;;) { 950 int32_t pcIndex = (int32_t)utext_getNativeIndex(text); 951 pc = utext_next32(text); 952 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; 953 chars += pcSize; 954 remaining -= pcSize; 955 if (remaining <= 0) { 956 break; 957 } 958 uc = utext_current32(text); 959 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 960 // Maybe. See if it's in the dictionary. 961 int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 962 utext_setNativeIndex(text, current+cuWordLength+chars); 963 if (candidates > 0) { 964 break; 965 } 966 } 967 } 968 969 // Bump the word count if there wasn't already one 970 if (cuWordLength <= 0) { 971 wordsFound += 1; 972 } 973 974 // Update the length with the passed-over characters 975 cuWordLength += chars; 976 } 977 else { 978 // Back up to where we were for next iteration 979 utext_setNativeIndex(text, current+cuWordLength); 980 } 981 } 982 983 // Never stop before a combining mark. 984 int32_t currPos; 985 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 986 utext_next32(text); 987 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 988 } 989 990 // Look ahead for possible suffixes if a dictionary word does not follow. 991 // We do this in code rather than using a rule so that the heuristic 992 // resynch continues to function. For example, one of the suffix characters 993 // could be a typo in the middle of a word. 994 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 995 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 996 // && fSuffixSet.contains(uc = utext_current32(text))) { 997 // if (uc == KHMER_PAIYANNOI) { 998 // if (!fSuffixSet.contains(utext_previous32(text))) { 999 // // Skip over previous end and PAIYANNOI 1000 // utext_next32(text); 1001 // utext_next32(text); 1002 // wordLength += 1; // Add PAIYANNOI to word 1003 // uc = utext_current32(text); // Fetch next character 1004 // } 1005 // else { 1006 // // Restore prior position 1007 // utext_next32(text); 1008 // } 1009 // } 1010 // if (uc == KHMER_MAIYAMOK) { 1011 // if (utext_previous32(text) != KHMER_MAIYAMOK) { 1012 // // Skip over previous end and MAIYAMOK 1013 // utext_next32(text); 1014 // utext_next32(text); 1015 // wordLength += 1; // Add MAIYAMOK to word 1016 // } 1017 // else { 1018 // // Restore prior position 1019 // utext_next32(text); 1020 // } 1021 // } 1022 // } 1023 // else { 1024 // utext_setNativeIndex(text, current+wordLength); 1025 // } 1026 // } 1027 1028 // Did we find a word on this iteration? If so, push it on the break stack 1029 if (cuWordLength > 0) { 1030 foundBreaks.push((current+cuWordLength), status); 1031 } 1032 } 1033 1034 // Don't return a break for the end of the dictionary range if there is one there. 1035 if (foundBreaks.peeki() >= rangeEnd) { 1036 (void) foundBreaks.popi(); 1037 wordsFound -= 1; 1038 } 1039 1040 return wordsFound; 1041 } 1042 1043 #if !UCONFIG_NO_NORMALIZATION 1044 /* 1045 ****************************************************************** 1046 * CjkBreakEngine 1047 */ 1048 static const uint32_t kuint32max = 0xFFFFFFFF; 1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 1050 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 1051 // Korean dictionary only includes Hangul syllables 1052 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 1053 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 1054 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 1055 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 1056 nfkcNorm2 = Normalizer2::getNFKCInstance(status); 1057 1058 if (U_SUCCESS(status)) { 1059 // handle Korean and Japanese/Chinese using different dictionaries 1060 if (type == kKorean) { 1061 setCharacters(fHangulWordSet); 1062 } else { //Chinese and Japanese 1063 UnicodeSet cjSet; 1064 cjSet.addAll(fHanWordSet); 1065 cjSet.addAll(fKatakanaWordSet); 1066 cjSet.addAll(fHiraganaWordSet); 1067 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 1068 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 1069 setCharacters(cjSet); 1070 } 1071 } 1072 } 1073 1074 CjkBreakEngine::~CjkBreakEngine(){ 1075 delete fDictionary; 1076 } 1077 1078 // The katakanaCost values below are based on the length frequencies of all 1079 // katakana phrases in the dictionary 1080 static const int32_t kMaxKatakanaLength = 8; 1081 static const int32_t kMaxKatakanaGroupLength = 20; 1082 static const uint32_t maxSnlp = 255; 1083 1084 static inline uint32_t getKatakanaCost(int32_t wordLength){ 1085 //TODO: fill array with actual values from dictionary! 1086 static const uint32_t katakanaCost[kMaxKatakanaLength + 1] 1087 = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; 1088 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; 1089 } 1090 1091 static inline bool isKatakana(UChar32 value) { 1092 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || 1093 (value >= 0xFF66 && value <= 0xFF9f); 1094 } 1095 1096 1097 // Function for accessing internal utext flags. 1098 // Replicates an internal UText function. 1099 1100 static inline int32_t utext_i32_flag(int32_t bitIndex) { 1101 return (int32_t)1 << bitIndex; 1102 } 1103 1104 1105 /* 1106 * @param text A UText representing the text 1107 * @param rangeStart The start of the range of dictionary characters 1108 * @param rangeEnd The end of the range of dictionary characters 1109 * @param foundBreaks vector<int32> to receive the break positions 1110 * @return The number of breaks found 1111 */ 1112 int32_t 1113 CjkBreakEngine::divideUpDictionaryRange( UText *inText, 1114 int32_t rangeStart, 1115 int32_t rangeEnd, 1116 UVector32 &foundBreaks ) const { 1117 if (rangeStart >= rangeEnd) { 1118 return 0; 1119 } 1120 1121 // UnicodeString version of input UText, NFKC normalized if necessary. 1122 UnicodeString inString; 1123 1124 // inputMap[inStringIndex] = corresponding native index from UText inText. 1125 // If NULL then mapping is 1:1 1126 LocalPointer<UVector32> inputMap; 1127 1128 UErrorCode status = U_ZERO_ERROR; 1129 1130 1131 // if UText has the input string as one contiguous UTF-16 chunk 1132 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) && 1133 inText->chunkNativeStart <= rangeStart && 1134 inText->chunkNativeLimit >= rangeEnd && 1135 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) { 1136 1137 // Input UText is in one contiguous UTF-16 chunk. 1138 // Use Read-only aliasing UnicodeString. 1139 inString.setTo(FALSE, 1140 inText->chunkContents + rangeStart - inText->chunkNativeStart, 1141 rangeEnd - rangeStart); 1142 } else { 1143 // Copy the text from the original inText (UText) to inString (UnicodeString). 1144 // Create a map from UnicodeString indices -> UText offsets. 1145 utext_setNativeIndex(inText, rangeStart); 1146 int32_t limit = rangeEnd; 1147 U_ASSERT(limit <= utext_nativeLength(inText)); 1148 if (limit > utext_nativeLength(inText)) { 1149 limit = (int32_t)utext_nativeLength(inText); 1150 } 1151 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status); 1152 if (U_FAILURE(status)) { 1153 return 0; 1154 } 1155 while (utext_getNativeIndex(inText) < limit) { 1156 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText); 1157 UChar32 c = utext_next32(inText); 1158 U_ASSERT(c != U_SENTINEL); 1159 inString.append(c); 1160 while (inputMap->size() < inString.length()) { 1161 inputMap->addElement(nativePosition, status); 1162 } 1163 } 1164 inputMap->addElement(limit, status); 1165 } 1166 1167 1168 if (!nfkcNorm2->isNormalized(inString, status)) { 1169 UnicodeString normalizedInput; 1170 // normalizedMap[normalizedInput position] == original UText position. 1171 LocalPointer<UVector32> normalizedMap(new UVector32(status), status); 1172 if (U_FAILURE(status)) { 1173 return 0; 1174 } 1175 1176 UnicodeString fragment; 1177 UnicodeString normalizedFragment; 1178 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk 1179 fragment.remove(); 1180 int32_t fragmentStartI = srcI; 1181 UChar32 c = inString.char32At(srcI); 1182 for (;;) { 1183 fragment.append(c); 1184 srcI = inString.moveIndex32(srcI, 1); 1185 if (srcI == inString.length()) { 1186 break; 1187 } 1188 c = inString.char32At(srcI); 1189 if (nfkcNorm2->hasBoundaryBefore(c)) { 1190 break; 1191 } 1192 } 1193 nfkcNorm2->normalize(fragment, normalizedFragment, status); 1194 normalizedInput.append(normalizedFragment); 1195 1196 // Map every position in the normalized chunk to the start of the chunk 1197 // in the original input. 1198 int32_t fragmentOriginalStart = inputMap.isValid() ? 1199 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart; 1200 while (normalizedMap->size() < normalizedInput.length()) { 1201 normalizedMap->addElement(fragmentOriginalStart, status); 1202 if (U_FAILURE(status)) { 1203 break; 1204 } 1205 } 1206 } 1207 U_ASSERT(normalizedMap->size() == normalizedInput.length()); 1208 int32_t nativeEnd = inputMap.isValid() ? 1209 inputMap->elementAti(inString.length()) : inString.length()+rangeStart; 1210 normalizedMap->addElement(nativeEnd, status); 1211 1212 inputMap.moveFrom(normalizedMap); 1213 inString.moveFrom(normalizedInput); 1214 } 1215 1216 int32_t numCodePts = inString.countChar32(); 1217 if (numCodePts != inString.length()) { 1218 // There are supplementary characters in the input. 1219 // The dictionary will produce boundary positions in terms of code point indexes, 1220 // not in terms of code unit string indexes. 1221 // Use the inputMap mechanism to take care of this in addition to indexing differences 1222 // from normalization and/or UTF-8 input. 1223 UBool hadExistingMap = inputMap.isValid(); 1224 if (!hadExistingMap) { 1225 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status); 1226 if (U_FAILURE(status)) { 1227 return 0; 1228 } 1229 } 1230 int32_t cpIdx = 0; 1231 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) { 1232 U_ASSERT(cuIdx >= cpIdx); 1233 if (hadExistingMap) { 1234 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx); 1235 } else { 1236 inputMap->addElement(cuIdx+rangeStart, status); 1237 } 1238 cpIdx++; 1239 if (cuIdx == inString.length()) { 1240 break; 1241 } 1242 } 1243 } 1244 1245 // bestSnlp[i] is the snlp of the best segmentation of the first i 1246 // code points in the range to be matched. 1247 UVector32 bestSnlp(numCodePts + 1, status); 1248 bestSnlp.addElement(0, status); 1249 for(int32_t i = 1; i <= numCodePts; i++) { 1250 bestSnlp.addElement(kuint32max, status); 1251 } 1252 1253 1254 // prev[i] is the index of the last CJK code point in the previous word in 1255 // the best segmentation of the first i characters. 1256 UVector32 prev(numCodePts + 1, status); 1257 for(int32_t i = 0; i <= numCodePts; i++){ 1258 prev.addElement(-1, status); 1259 } 1260 1261 const int32_t maxWordSize = 20; 1262 UVector32 values(numCodePts, status); 1263 values.setSize(numCodePts); 1264 UVector32 lengths(numCodePts, status); 1265 lengths.setSize(numCodePts); 1266 1267 UText fu = UTEXT_INITIALIZER; 1268 utext_openUnicodeString(&fu, &inString, &status); 1269 1270 // Dynamic programming to find the best segmentation. 1271 1272 // In outer loop, i is the code point index, 1273 // ix is the corresponding string (code unit) index. 1274 // They differ when the string contains supplementary characters. 1275 int32_t ix = 0; 1276 bool is_prev_katakana = false; 1277 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) { 1278 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { 1279 continue; 1280 } 1281 1282 int32_t count; 1283 utext_setNativeIndex(&fu, ix); 1284 count = fDictionary->matches(&fu, maxWordSize, numCodePts, 1285 NULL, lengths.getBuffer(), values.getBuffer(), NULL); 1286 // Note: lengths is filled with code point lengths 1287 // The NULL parameter is the ignored code unit lengths. 1288 1289 // if there are no single character matches found in the dictionary 1290 // starting with this character, treat character as a 1-character word 1291 // with the highest value possible, i.e. the least likely to occur. 1292 // Exclude Korean characters from this treatment, as they should be left 1293 // together by default. 1294 if ((count == 0 || lengths.elementAti(0) != 1) && 1295 !fHangulWordSet.contains(inString.char32At(ix))) { 1296 values.setElementAt(maxSnlp, count); // 255 1297 lengths.setElementAt(1, count++); 1298 } 1299 1300 for (int32_t j = 0; j < count; j++) { 1301 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j); 1302 int32_t ln_j_i = lengths.elementAti(j) + i; 1303 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) { 1304 bestSnlp.setElementAt(newSnlp, ln_j_i); 1305 prev.setElementAt(i, ln_j_i); 1306 } 1307 } 1308 1309 // In Japanese, 1310 // Katakana word in single character is pretty rare. So we apply 1311 // the following heuristic to Katakana: any continuous run of Katakana 1312 // characters is considered a candidate word with a default cost 1313 // specified in the katakanaCost table according to its length. 1314 1315 bool is_katakana = isKatakana(inString.char32At(ix)); 1316 int32_t katakanaRunLength = 1; 1317 if (!is_prev_katakana && is_katakana) { 1318 int32_t j = inString.moveIndex32(ix, 1); 1319 // Find the end of the continuous run of Katakana characters 1320 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength && 1321 isKatakana(inString.char32At(j))) { 1322 j = inString.moveIndex32(j, 1); 1323 katakanaRunLength++; 1324 } 1325 if (katakanaRunLength < kMaxKatakanaGroupLength) { 1326 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength); 1327 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) { 1328 bestSnlp.setElementAt(newSnlp, j); 1329 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i; 1330 } 1331 } 1332 } 1333 is_prev_katakana = is_katakana; 1334 } 1335 utext_close(&fu); 1336 1337 // Start pushing the optimal offset index into t_boundary (t for tentative). 1338 // prev[numCodePts] is guaranteed to be meaningful. 1339 // We'll first push in the reverse order, i.e., 1340 // t_boundary[0] = numCodePts, and afterwards do a swap. 1341 UVector32 t_boundary(numCodePts+1, status); 1342 1343 int32_t numBreaks = 0; 1344 // No segmentation found, set boundary to end of range 1345 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { 1346 t_boundary.addElement(numCodePts, status); 1347 numBreaks++; 1348 } else { 1349 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { 1350 t_boundary.addElement(i, status); 1351 numBreaks++; 1352 } 1353 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); 1354 } 1355 1356 // Add a break for the start of the dictionary range if there is not one 1357 // there already. 1358 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 1359 t_boundary.addElement(0, status); 1360 numBreaks++; 1361 } 1362 1363 // Now that we're done, convert positions in t_boundary[] (indices in 1364 // the normalized input string) back to indices in the original input UText 1365 // while reversing t_boundary and pushing values to foundBreaks. 1366 int32_t prevCPPos = -1; 1367 int32_t prevUTextPos = -1; 1368 for (int32_t i = numBreaks-1; i >= 0; i--) { 1369 int32_t cpPos = t_boundary.elementAti(i); 1370 U_ASSERT(cpPos > prevCPPos); 1371 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart; 1372 U_ASSERT(utextPos >= prevUTextPos); 1373 if (utextPos > prevUTextPos) { 1374 // Boundaries are added to foundBreaks output in ascending order. 1375 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); 1376 foundBreaks.push(utextPos, status); 1377 } else { 1378 // Normalization expanded the input text, the dictionary found a boundary 1379 // within the expansion, giving two boundaries with the same index in the 1380 // original text. Ignore the second. See ticket #12918. 1381 --numBreaks; 1382 } 1383 prevCPPos = cpPos; 1384 prevUTextPos = utextPos; 1385 } 1386 (void)prevCPPos; // suppress compiler warnings about unused variable 1387 1388 // inString goes out of scope 1389 // inputMap goes out of scope 1390 return numBreaks; 1391 } 1392 #endif 1393 1394 U_NAMESPACE_END 1395 1396 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1397 1398