/**
 *******************************************************************************
 * Copyright (C) 2006-2015, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvectr32.h"
#include "uvector.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}

int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    //   The span to break begins at the current position in the text, and
    //   extends towards the start or end of the text, depending on 'reverse'.

    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        if (current < startPos) {
            rangeStart = startPos;
        } else {
            rangeStart = current;
            if (!isDict) {
                utext_next32(text);
                rangeStart = utext_getNativeIndex(text);
            }
        }
        // rangeEnd = start + 1;
        utext_setNativeIndex(text, start);
        utext_next32(text);
        rangeEnd = utext_getNativeIndex(text);
    }
    else {
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}

void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*
 ******************************************************************
 * PossibleWord
 */

// Helper class for improving readability of the Thai/Lao/Burmese/Khmer word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
//   that form a nested sequence.
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;

class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    // TODO: bytes would be sufficient for word lengths.
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int32_t   mark;       // The preferred candidate's offset
    int32_t   current;    // The candidate we're currently looking at
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word lengths, in code units.
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word lengths, in code points.

public:
    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
    ~PossibleWord() {};

    // Fill the list of candidates if needed, select the longest, and return the number found
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    //   and point the text after it
    UBool     backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    //   Return value is in code points.
    int32_t   longestPrefix() { return prefix; };

    // Mark the current candidate as the one we like
    void      markCurrent() { mark = current; };

    // Get length in code points of the marked word.
    int32_t   markedCPLength() { return cpLengths[mark]; };
};


int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+cuLengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + cuLengths[mark]);
    return cuLengths[mark];
}


UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + cuLengths[--current]);
        return TRUE;
    }
    return FALSE;
}

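// Usage note (informal sketch, not part of the algorithm itself): the engines below
// keep a small ring buffer of PossibleWord slots, indexed modulo the engine's
// LOOKAHEAD constant, so that the current position and up to two positions of
// lookahead can be probed without re-running the dictionary. Roughly, per iteration:
//
//     PossibleWord words[THAI_LOOKAHEAD];
//     int32_t n = words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
//     // ... probe words[(wordsFound + 1) % THAI_LOOKAHEAD], markCurrent()/backUp() as needed ...
//     cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
//
// The real control flow is in the divideUpDictionaryRange() implementations that follow.
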
/*
 ******************************************************************
 * ThaiBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

// Elision character
static const int32_t THAI_PAIYANNOI = 0x0E2F;

// Repeat character
static const int32_t THAI_MAIYAMOK = 0x0E46;

// Minimum word size
static const int32_t THAI_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;

ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}

int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UStack &foundBreaks ) const {
    utext_setNativeIndex(text, rangeStart);
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
    if (utext_getNativeIndex(text) >= rangeEnd) {
        return 0;       // Not enough characters for two words
    }
    utext_setNativeIndex(text, rangeStart);


    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;    // Word length in code points.
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[THAI_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cpWordLength = 0;
        cuWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            // Set UText position to after the accepted word.
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.

        UChar32 uc = 0;
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                UChar32 pc;
                int32_t chars = 0;
                for (;;) {
                    int32_t pcIndex = utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        int32_t paiyannoiIndex = utext_getNativeIndex(text);
                        utext_next32(text);
                        cuWordLength += utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        int32_t maiyamokIndex = utext_getNativeIndex(text);
                        utext_next32(text);
                        cuWordLength += utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

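// Usage note (illustrative only; this snippet is not part of this file's build):
// these engines are not called directly by applications. They are selected
// internally when a word or line break iterator encounters a run of the relevant
// script, e.g. via the public C API:
//
//     UErrorCode status = U_ZERO_ERROR;
//     UBreakIterator *bi = ubrk_open(UBRK_WORD, "th", text, textLength, &status);
//     for (int32_t b = ubrk_first(bi); b != UBRK_DONE; b = ubrk_next(bi)) {
//         // each b is a word boundary; Thai runs are segmented by ThaiBreakEngine
//     }
//     ubrk_close(bi);
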
/*
 ******************************************************************
 * LaoBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t LAO_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;

LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}

LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}

int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[LAO_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    int32_t pcIndex = utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

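// The Burmese and Khmer engines below follow the same dictionary-plus-lookahead
// outline as the Thai and Lao engines above; only the character sets differ, and
// the Thai-style suffix handling is not applicable (it is left commented out in
// the Khmer code).
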
/*
 ******************************************************************
 * BurmeseBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t BURMESE_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;

BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fBurmeseWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fBurmeseWordSet;
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}

BurmeseBreakEngine::~BurmeseBreakEngine() {
    delete fDictionary;
}

int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[BURMESE_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
foundBest:
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    int32_t pcIndex = utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * KhmerBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t KHMER_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;

KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);               // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    // fEndWordSet.remove(0x0E31);            // MAI HAN-AKAT
    // fEndWordSet.remove(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
    // fBeginWordSet.add(0x0E01, 0x0E2E);     // KO KAI through HO NOKHUK
    // fBeginWordSet.add(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    // fSuffixSet.add(THAI_PAIYANNOI);
    // fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    // fSuffixSet.compact();
}

KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}

int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    int32_t pcIndex = utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+cuWordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }
    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { // Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}

CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                          = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
           (value >= 0xFF66u && value <= 0xFF9fu);
}


// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return (int32_t)1 << bitIndex;
}

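// Informal sketch of the segmentation strategy used below (the authoritative logic
// is in divideUpDictionaryRange()): the engine runs a dynamic program over "snlp"
// (summed negative log probability) costs from the dictionary. For each code point
// index j of the normalized input,
//
//     bestSnlp[j] = min over dictionary words w ending at j of
//                       bestSnlp[j - length(w)] + cost(w)
//
// with prev[j] recording where the minimizing word starts, so the best segmentation
// is recovered by walking prev[] back from the end of the range. A katakana-run
// heuristic and a fixed cost for unknown characters are layered on top, as described
// in the inline comments.
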
/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    // UnicodeString version of input UText, NFKC normalized if necessary.
    UnicodeString inString;

    // inputMap[inStringIndex] = corresponding native index from UText inText.
    // If NULL then mapping is 1:1
    LocalPointer<UVector32> inputMap;

    UErrorCode status = U_ZERO_ERROR;


    // if UText has the input string as one contiguous UTF-16 chunk
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
         inText->chunkNativeStart <= rangeStart &&
         inText->chunkNativeLimit >= rangeEnd   &&
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

        // Input UText is in one contiguous UTF-16 chunk.
        // Use Read-only aliasing UnicodeString.
        inString.setTo(FALSE,
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
                       rangeEnd - rangeStart);
    } else {
        // Copy the text from the original inText (UText) to inString (UnicodeString).
        // Create a map from UnicodeString indices -> UText offsets.
        utext_setNativeIndex(inText, rangeStart);
        int32_t limit = rangeEnd;
        U_ASSERT(limit <= utext_nativeLength(inText));
        if (limit > utext_nativeLength(inText)) {
            limit = utext_nativeLength(inText);
        }
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }
        while (utext_getNativeIndex(inText) < limit) {
            int32_t nativePosition = utext_getNativeIndex(inText);
            UChar32 c = utext_next32(inText);
            U_ASSERT(c != U_SENTINEL);
            inString.append(c);
            while (inputMap->size() < inString.length()) {
                inputMap->addElement(nativePosition, status);
            }
        }
        inputMap->addElement(limit, status);
    }


    if (!nfkcNorm2->isNormalized(inString, status)) {
        UnicodeString normalizedInput;
        // normalizedMap[normalizedInput position] == original UText position.
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }

        UnicodeString fragment;
        UnicodeString normalizedFragment;
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
            fragment.remove();
            int32_t fragmentStartI = srcI;
            UChar32 c = inString.char32At(srcI);
            for (;;) {
                fragment.append(c);
                srcI = inString.moveIndex32(srcI, 1);
                if (srcI == inString.length()) {
                    break;
                }
                c = inString.char32At(srcI);
                if (nfkcNorm2->hasBoundaryBefore(c)) {
                    break;
                }
            }
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
            normalizedInput.append(normalizedFragment);

            // Map every position in the normalized chunk to the start of the chunk
            //   in the original input.
            int32_t fragmentOriginalStart = inputMap.isValid() ?
                inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
            while (normalizedMap->size() < normalizedInput.length()) {
                normalizedMap->addElement(fragmentOriginalStart, status);
                if (U_FAILURE(status)) {
                    break;
                }
            }
        }
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
        int32_t nativeEnd = inputMap.isValid() ?
            inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
        normalizedMap->addElement(nativeEnd, status);

        inputMap.moveFrom(normalizedMap);
        inString.moveFrom(normalizedInput);
    }

    int32_t numCodePts = inString.countChar32();
    if (numCodePts != inString.length()) {
        // There are supplementary characters in the input.
        // The dictionary will produce boundary positions in terms of code point indexes,
        //   not in terms of code unit string indexes.
        // Use the inputMap mechanism to take care of this in addition to indexing differences
        //   from normalization and/or UTF-8 input.
        UBool hadExistingMap = inputMap.isValid();
        if (!hadExistingMap) {
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
            if (U_FAILURE(status)) {
                return 0;
            }
        }
        int32_t cpIdx = 0;
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
            U_ASSERT(cuIdx >= cpIdx);
            if (hadExistingMap) {
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
            } else {
                inputMap->addElement(cuIdx+rangeStart, status);
            }
            cpIdx++;
            if (cuIdx == inString.length()) {
                break;
            }
        }
    }

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // code points in the range to be matched.
    UVector32 bestSnlp(numCodePts + 1, status);
    bestSnlp.addElement(0, status);
    for(int32_t i = 1; i <= numCodePts; i++) {
        bestSnlp.addElement(kuint32max, status);
    }


    // prev[i] is the index of the last CJK code point in the previous word in
    // the best segmentation of the first i characters.
    UVector32 prev(numCodePts + 1, status);
    for(int32_t i = 0; i <= numCodePts; i++){
        prev.addElement(-1, status);
    }

    const int32_t maxWordSize = 20;
    UVector32 values(numCodePts, status);
    values.setSize(numCodePts);
    UVector32 lengths(numCodePts, status);
    lengths.setSize(numCodePts);

    UText fu = UTEXT_INITIALIZER;
    utext_openUnicodeString(&fu, &inString, &status);

    // Dynamic programming to find the best segmentation.

    // In outer loop, i  is the code point index,
    //                ix is the corresponding string (code unit) index.
    //    They differ when the string contains supplementary characters.
    int32_t ix = 0;
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
            continue;
        }

        int32_t count;
        utext_setNativeIndex(&fu, ix);
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
                             // Note: lengths is filled with code point lengths
                             //       The NULL parameter is the ignored code unit lengths.

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if ((count == 0 || lengths.elementAti(0) != 1) &&
                !fHangulWordSet.contains(inString.char32At(ix))) {
            values.setElementAt(maxSnlp, count);   // 255
            lengths.setElementAt(1, count++);
        }

        for (int32_t j = 0; j < count; j++) {
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
            int32_t ln_j_i = lengths.elementAti(j) + i;
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
                bestSnlp.setElementAt(newSnlp, ln_j_i);
                prev.setElementAt(i, ln_j_i);
            }
        }

        // In Japanese,
        // a Katakana word in a single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.

        bool is_prev_katakana = false;
        bool is_katakana = isKatakana(inString.char32At(ix));
        int32_t katakanaRunLength = 1;
        if (!is_prev_katakana && is_katakana) {
            int32_t j = inString.moveIndex32(ix, 1);
            // Find the end of the continuous run of Katakana characters
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
                    isKatakana(inString.char32At(j))) {
                j = inString.moveIndex32(j, 1);
                katakanaRunLength++;
            }
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
                if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
                    bestSnlp.setElementAt(newSnlp, j);
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }
    utext_close(&fu);

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numCodePts] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numCodePts, and afterwards do a swap.
    UVector32 t_boundary(numCodePts+1, status);

    int32_t numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
        t_boundary.addElement(numCodePts, status);
        numBreaks++;
    } else {
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
            t_boundary.addElement(i, status);
            numBreaks++;
        }
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
    }

    // Add a break for the start of the dictionary range if there is not one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary.addElement(0, status);
        numBreaks++;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the original input UText
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int32_t i = numBreaks-1; i >= 0; i--) {
        int32_t cpPos = t_boundary.elementAti(i);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        // Boundaries are added to foundBreaks output in ascending order.
        U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
        foundBreaks.push(utextPos, status);
    }

    // inString goes out of scope
    // inputMap goes out of scope
    return numBreaks;
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */